diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -15,27 +15,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.587890625, + "completions/clipped_ratio": 0.626953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1698.599609375, - "completions/mean_terminated_length": 1200.1658935546875, - "completions/min_length": 20.0, - "completions/min_terminated_length": 20.0, + "completions/mean_length": 1722.69921875, + "completions/mean_terminated_length": 1175.989501953125, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, "epoch": 0.0003413843133907997, - "grad_norm": 0.10431239753961563, - "kl": 0.0006151199340820312, + "grad_norm": 0.10038339346647263, + "kl": 0.0006093978881835938, "learning_rate": 0.0, - "loss": 0.0844, - "num_tokens": 948291.0, - "reward": 0.46240234375, - "reward_std": 0.2197888195514679, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, + "loss": 0.0739, + "num_tokens": 960630.0, + "reward": 0.4541015625, + "reward_std": 0.20013213157653809, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.39208984375, - "rewards/tag_count_reward/std": 0.18950168788433075, + "rewards/tag_count_reward/mean": 0.3955078125, + "rewards/tag_count_reward/std": 0.19985154271125793, "step": 1 }, { @@ -44,27 +44,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.673828125, + "completions/clipped_ratio": 0.654296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 1758.5859375, - "completions/mean_terminated_length": 1160.6947021484375, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1727.654296875, + "completions/mean_terminated_length": 1121.350341796875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, "epoch": 0.0006827686267815994, - "grad_norm": 0.09877148270606995, - "kl": 0.0005578994750976562, - "learning_rate": 3.424657534246575e-09, - "loss": 0.0661, - "num_tokens": 1921919.0, - "reward": 0.3974609375, - "reward_std": 0.1725420355796814, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, + "grad_norm": 0.1154610738158226, + "kl": 0.000568389892578125, + "learning_rate": 3.4129692832764506e-09, + "loss": 0.07, + "num_tokens": 1918421.0, + "reward": 0.40283203125, + "reward_std": 0.16966792941093445, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.3583984375, - "rewards/tag_count_reward/std": 0.15551748871803284, + "rewards/tag_count_reward/mean": 0.35986328125, + "rewards/tag_count_reward/std": 0.15087929368019104, "step": 2 }, { @@ -73,27 +73,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.529296875, + "completions/clipped_ratio": 0.560546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 1627.919921875, - "completions/mean_terminated_length": 1155.5477294921875, - "completions/min_length": 20.0, - "completions/min_terminated_length": 20.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1669.453125, + "completions/mean_terminated_length": 1186.5955810546875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, "epoch": 0.001024152940172399, - "grad_norm": 0.092210553586483, - "kl": 0.0005474090576171875, - "learning_rate": 6.84931506849315e-09, - "loss": 0.074, - "num_tokens": 2837286.0, - "reward": 0.50048828125, - "reward_std": 0.21651187539100647, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, + "grad_norm": 0.08359593898057938, + "kl": 0.0005421638488769531, + "learning_rate": 6.825938566552901e-09, + "loss": 0.0669, + "num_tokens": 2855053.0, + "reward": 0.47509765625, + "reward_std": 0.2010759711265564, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.40869140625, - "rewards/tag_count_reward/std": 0.18128602206707, + "rewards/tag_count_reward/mean": 0.39111328125, + "rewards/tag_count_reward/std": 0.16621176898479462, "step": 3 }, { @@ -102,27 +102,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.673828125, + "completions/clipped_ratio": 0.65234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1769.169921875, - "completions/mean_terminated_length": 1193.143798828125, - "completions/min_length": 32.0, - "completions/min_terminated_length": 32.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1751.20703125, + "completions/mean_terminated_length": 1194.3033447265625, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, "epoch": 0.0013655372535631989, - "grad_norm": 0.09093520045280457, - "kl": 0.000579833984375, - "learning_rate": 1.0273972602739724e-08, - "loss": 0.0932, - "num_tokens": 3823501.0, - "reward": 0.38232421875, - "reward_std": 0.1507340669631958, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, + "grad_norm": 0.09588006138801575, + "kl": 0.0006213188171386719, + "learning_rate": 1.023890784982935e-08, + "loss": 0.0753, + "num_tokens": 3832071.0, + "reward": 0.39599609375, + "reward_std": 0.14499154686927795, + "rewards/accuracy_reward/mean": 0.029296875, + "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.35107421875, - "rewards/tag_count_reward/std": 0.15216577053070068, + "rewards/tag_count_reward/mean": 0.36669921875, + "rewards/tag_count_reward/std": 0.1653124988079071, "step": 4 }, { @@ -131,27 +131,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.51171875, + "completions/clipped_ratio": 0.541015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1616.763671875, - "completions/mean_terminated_length": 1164.8280029296875, - "completions/min_length": 231.0, - "completions/min_terminated_length": 231.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1612.12109375, + "completions/mean_terminated_length": 1098.34033203125, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, "epoch": 0.0017069215669539984, - "grad_norm": 0.11474630981683731, - "kl": 0.0005540847778320312, - "learning_rate": 1.36986301369863e-08, - "loss": 0.0551, - "num_tokens": 4732788.0, - "reward": 0.5478515625, - "reward_std": 0.23933620750904083, - "rewards/accuracy_reward/mean": 0.14314515888690948, - "rewards/accuracy_reward/std": 0.35057440400123596, + "grad_norm": 0.1058402881026268, + "kl": 0.0005502700805664062, + "learning_rate": 1.3651877133105802e-08, + "loss": 0.0732, + "num_tokens": 4738981.0, + "reward": 0.52099609375, + "reward_std": 0.24503561854362488, + "rewards/accuracy_reward/mean": 0.12096773833036423, + "rewards/accuracy_reward/std": 0.32641899585723877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.4091796875, - "rewards/tag_count_reward/std": 0.17640604078769684, + "rewards/tag_count_reward/mean": 0.40380859375, + "rewards/tag_count_reward/std": 0.170328751206398, "step": 5 }, { @@ -160,27 +160,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.5546875, + "completions/clipped_ratio": 0.55859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1634.732421875, - "completions/mean_terminated_length": 1119.9605712890625, - "completions/min_length": 66.0, - "completions/min_terminated_length": 66.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1597.671875, + "completions/mean_terminated_length": 1027.78759765625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, "epoch": 0.002048305880344798, - "grad_norm": 0.11417374759912491, - "kl": 0.00060272216796875, - "learning_rate": 1.7123287671232876e-08, - "loss": 0.0843, - "num_tokens": 5657051.0, - "reward": 0.4462890625, - "reward_std": 0.17480555176734924, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, + "grad_norm": 0.11257678270339966, + "kl": 0.0005893707275390625, + "learning_rate": 1.706484641638225e-08, + "loss": 0.112, + "num_tokens": 5644269.0, + "reward": 0.462890625, + "reward_std": 0.20636704564094543, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.3837890625, - "rewards/tag_count_reward/std": 0.1545807421207428, + "rewards/tag_count_reward/mean": 0.388671875, + "rewards/tag_count_reward/std": 0.17220963537693024, "step": 6 }, { @@ -189,27 +189,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.603515625, + "completions/clipped_ratio": 0.65234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1716.001953125, - "completions/mean_terminated_length": 1210.645263671875, - "completions/min_length": 218.0, - "completions/min_terminated_length": 218.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1745.85546875, + "completions/mean_terminated_length": 1178.91015625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.0023896901937355977, - "grad_norm": 0.10724660009145737, - "kl": 0.0005521774291992188, - "learning_rate": 2.054794520547945e-08, - "loss": 0.0957, - "num_tokens": 6608028.0, - "reward": 0.4287109375, - "reward_std": 0.19955545663833618, - "rewards/accuracy_reward/mean": 0.04233871027827263, - "rewards/accuracy_reward/std": 0.2015640139579773, + "grad_norm": 0.09491042047739029, + "kl": 0.0005645751953125, + "learning_rate": 2.04778156996587e-08, + "loss": 0.0996, + "num_tokens": 6610531.0, + "reward": 0.3984375, + "reward_std": 0.1375826597213745, + "rewards/accuracy_reward/mean": 0.032258063554763794, + "rewards/accuracy_reward/std": 0.17686307430267334, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.3876953125, - "rewards/tag_count_reward/std": 0.1806028187274933, + "rewards/tag_count_reward/mean": 0.3671875, + "rewards/tag_count_reward/std": 0.15542221069335938, "step": 7 }, { @@ -218,27 +218,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.55078125, + "completions/clipped_ratio": 0.564453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1729.3828125, - "completions/mean_terminated_length": 1338.7303466796875, - "completions/min_length": 310.0, - "completions/min_terminated_length": 310.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1722.056640625, + "completions/mean_terminated_length": 1299.645751953125, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, "epoch": 0.0027310745071263977, - "grad_norm": 0.09726861119270325, - "kl": 0.00057220458984375, - "learning_rate": 2.3972602739726024e-08, - "loss": 0.0874, - "num_tokens": 7570176.0, - "reward": 0.5234375, - "reward_std": 0.27367502450942993, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, + "grad_norm": 0.0939500480890274, + "kl": 0.0005664825439453125, + "learning_rate": 2.3890784982935154e-08, + "loss": 0.1146, + "num_tokens": 7568928.0, + "reward": 0.51806640625, + "reward_std": 0.2929551601409912, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.396484375, - "rewards/tag_count_reward/std": 0.16633892059326172, + "rewards/tag_count_reward/mean": 0.39892578125, + "rewards/tag_count_reward/std": 0.1703680157661438, "step": 8 }, { @@ -247,27 +247,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.556640625, + "completions/clipped_ratio": 0.5546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 1642.70703125, - "completions/mean_terminated_length": 1133.8590087890625, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1643.587890625, + "completions/mean_terminated_length": 1139.8465576171875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.0030724588205171973, - "grad_norm": 0.10777005553245544, - "kl": 0.0005903244018554688, - "learning_rate": 2.73972602739726e-08, - "loss": 0.0881, - "num_tokens": 8485034.0, - "reward": 0.453125, - "reward_std": 0.20808660984039307, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, + "grad_norm": 0.11887728422880173, + "kl": 0.000579833984375, + "learning_rate": 2.7303754266211605e-08, + "loss": 0.1153, + "num_tokens": 8484237.0, + "reward": 0.4580078125, + "reward_std": 0.21570083498954773, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.404296875, - "rewards/tag_count_reward/std": 0.1943957805633545, + "rewards/tag_count_reward/mean": 0.3994140625, + "rewards/tag_count_reward/std": 0.18209920823574066, "step": 9 }, { @@ -276,27 +276,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.5859375, + "completions/clipped_ratio": 0.62890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1675.646484375, - "completions/mean_terminated_length": 1148.731201171875, - "completions/min_length": 276.0, - "completions/min_terminated_length": 276.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1729.3046875, + "completions/mean_terminated_length": 1189.2000732421875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.003413843133907997, - "grad_norm": 0.0893247202038765, - "kl": 0.0005788803100585938, - "learning_rate": 3.082191780821918e-08, - "loss": 0.089, - "num_tokens": 9414901.0, - "reward": 0.41845703125, - "reward_std": 0.1757884919643402, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, + "grad_norm": 0.09136687219142914, + "kl": 0.0005388259887695312, + "learning_rate": 3.071672354948805e-08, + "loss": 0.0744, + "num_tokens": 9441577.0, + "reward": 0.40087890625, + "reward_std": 0.168868750333786, + "rewards/accuracy_reward/mean": 0.033203125, + "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.38330078125, - "rewards/tag_count_reward/std": 0.16157081723213196, + "rewards/tag_count_reward/mean": 0.36767578125, + "rewards/tag_count_reward/std": 0.15701180696487427, "step": 10 }, { @@ -305,27 +305,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.576171875, + "completions/clipped_ratio": 0.62890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1673.0078125, - "completions/mean_terminated_length": 1163.225830078125, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1738.44921875, + "completions/mean_terminated_length": 1213.8421630859375, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, "epoch": 0.003755227447298797, - "grad_norm": 0.10025846213102341, - "kl": 0.0005784034729003906, - "learning_rate": 3.424657534246575e-08, - "loss": 0.0832, - "num_tokens": 10347065.0, - "reward": 0.431640625, - "reward_std": 0.19180549681186676, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, + "grad_norm": 0.09358195215463638, + "kl": 0.0005154609680175781, + "learning_rate": 3.41296928327645e-08, + "loss": 0.0806, + "num_tokens": 10407247.0, + "reward": 0.42236328125, + "reward_std": 0.21011707186698914, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.39453125, - "rewards/tag_count_reward/std": 0.17862646281719208, + "rewards/tag_count_reward/mean": 0.36181640625, + "rewards/tag_count_reward/std": 0.14695879817008972, "step": 11 }, { @@ -334,27 +334,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.517578125, + "completions/clipped_ratio": 0.568359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 1657.361328125, - "completions/mean_terminated_length": 1238.255126953125, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1708.470703125, + "completions/mean_terminated_length": 1261.398193359375, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, "epoch": 0.004096611760689596, - "grad_norm": 0.09967684745788574, - "kl": 0.0006275177001953125, - "learning_rate": 3.767123287671233e-08, - "loss": 0.1118, - "num_tokens": 11267010.0, - "reward": 0.50390625, - "reward_std": 0.24697786569595337, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, + "grad_norm": 0.08788277208805084, + "kl": 0.00052642822265625, + "learning_rate": 3.754266211604096e-08, + "loss": 0.0779, + "num_tokens": 11353360.0, + "reward": 0.46337890625, + "reward_std": 0.21695996820926666, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.41015625, - "rewards/tag_count_reward/std": 0.17411890625953674, + "rewards/tag_count_reward/mean": 0.38330078125, + "rewards/tag_count_reward/std": 0.15928363800048828, "step": 12 }, { @@ -363,27 +363,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.576171875, + "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 1611.318359375, - "completions/mean_terminated_length": 1017.6727905273438, - "completions/min_length": 18.0, - "completions/min_terminated_length": 18.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1668.15625, + "completions/mean_terminated_length": 1035.0833740234375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.004437996074080396, - "grad_norm": 0.11828067153692245, - "kl": 0.0007390975952148438, - "learning_rate": 4.10958904109589e-08, - "loss": 0.0974, - "num_tokens": 12167813.0, - "reward": 0.474609375, - "reward_std": 0.20802630484104156, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, + "grad_norm": 0.10812373459339142, + "kl": 0.0005941390991210938, + "learning_rate": 4.09556313993174e-08, + "loss": 0.0842, + "num_tokens": 12283264.0, + "reward": 0.46923828125, + "reward_std": 0.23663829267024994, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.39453125, - "rewards/tag_count_reward/std": 0.19436629116535187, + "rewards/tag_count_reward/mean": 0.36962890625, + "rewards/tag_count_reward/std": 0.17266887426376343, "step": 13 }, { @@ -392,27 +392,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.51171875, + "completions/clipped_ratio": 0.572265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1670.515625, - "completions/mean_terminated_length": 1274.912109375, - "completions/min_length": 98.0, - "completions/min_terminated_length": 98.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1729.04296875, + "completions/mean_terminated_length": 1302.3104248046875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, "epoch": 0.0047793803874711955, - "grad_norm": 0.0998254269361496, - "kl": 0.0006895065307617188, - "learning_rate": 4.452054794520547e-08, - "loss": 0.108, - "num_tokens": 13100381.0, - "reward": 0.46630859375, - "reward_std": 0.22284796833992004, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, + "grad_norm": 0.09474898874759674, + "kl": 0.0005512237548828125, + "learning_rate": 4.436860068259386e-08, + "loss": 0.1013, + "num_tokens": 13245798.0, + "reward": 0.45068359375, + "reward_std": 0.19786083698272705, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.40966796875, - "rewards/tag_count_reward/std": 0.17561545968055725, + "rewards/tag_count_reward/mean": 0.38623046875, + "rewards/tag_count_reward/std": 0.1636510044336319, "step": 14 }, { @@ -421,27 +421,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.486328125, + "completions/clipped_ratio": 0.521484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 1561.16015625, - "completions/mean_terminated_length": 1100.2357177734375, - "completions/min_length": 14.0, - "completions/min_terminated_length": 14.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1600.45703125, + "completions/mean_terminated_length": 1112.7264404296875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, "epoch": 0.005120764700861995, - "grad_norm": 0.10169485211372375, - "kl": 0.0006809234619140625, - "learning_rate": 4.794520547945205e-08, - "loss": 0.1075, - "num_tokens": 13976847.0, - "reward": 0.5224609375, - "reward_std": 0.2715893089771271, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, + "grad_norm": 0.10015860199928284, + "kl": 0.0005078315734863281, + "learning_rate": 4.778156996587031e-08, + "loss": 0.1058, + "num_tokens": 14142384.0, + "reward": 0.4990234375, + "reward_std": 0.2508639097213745, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.4306640625, - "rewards/tag_count_reward/std": 0.20048168301582336, + "rewards/tag_count_reward/mean": 0.4052734375, + "rewards/tag_count_reward/std": 0.1743355393409729, "step": 15 }, { @@ -450,27 +450,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.57421875, + "completions/clipped_ratio": 0.59765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 1646.515625, - "completions/mean_terminated_length": 1105.064208984375, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1674.45703125, + "completions/mean_terminated_length": 1119.58251953125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, "epoch": 0.0054621490142527955, - "grad_norm": 0.1059543713927269, - "kl": 0.00087738037109375, - "learning_rate": 5.136986301369862e-08, - "loss": 0.0657, - "num_tokens": 14895031.0, - "reward": 0.50146484375, - "reward_std": 0.2400987446308136, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, + "grad_norm": 0.10255003720521927, + "kl": 0.0005817413330078125, + "learning_rate": 5.119453924914675e-08, + "loss": 0.1018, + "num_tokens": 15074874.0, + "reward": 0.509765625, + "reward_std": 0.23126041889190674, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.38427734375, - "rewards/tag_count_reward/std": 0.16227306425571442, + "rewards/tag_count_reward/mean": 0.380859375, + "rewards/tag_count_reward/std": 0.1740640103816986, "step": 16 }, { @@ -479,27 +479,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.486328125, + "completions/clipped_ratio": 0.583984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1559.234375, - "completions/mean_terminated_length": 1096.4866943359375, - "completions/min_length": 23.0, - "completions/min_terminated_length": 23.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1658.314453125, + "completions/mean_terminated_length": 1111.2911376953125, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, "epoch": 0.005803533327643595, - "grad_norm": 0.10745397210121155, - "kl": 0.00081634521484375, - "learning_rate": 5.47945205479452e-08, - "loss": 0.0947, - "num_tokens": 15769087.0, - "reward": 0.53662109375, - "reward_std": 0.2537875771522522, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, + "grad_norm": 0.10060305893421173, + "kl": 0.0005750656127929688, + "learning_rate": 5.460750853242321e-08, + "loss": 0.0727, + "num_tokens": 15999659.0, + "reward": 0.46630859375, + "reward_std": 0.2180027961730957, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.42919921875, - "rewards/tag_count_reward/std": 0.20209747552871704, + "rewards/tag_count_reward/mean": 0.38818359375, + "rewards/tag_count_reward/std": 0.17010420560836792, "step": 17 }, { @@ -508,27 +508,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.609375, + "completions/clipped_ratio": 0.638671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1708.259765625, - "completions/mean_terminated_length": 1178.2650146484375, - "completions/min_length": 218.0, - "completions/min_terminated_length": 218.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1756.791015625, + "completions/mean_terminated_length": 1242.0594482421875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, "epoch": 0.006144917641034395, - "grad_norm": 0.10190171003341675, - "kl": 0.0008611679077148438, - "learning_rate": 5.821917808219177e-08, - "loss": 0.115, - "num_tokens": 16726676.0, - "reward": 0.45458984375, - "reward_std": 0.20294985175132751, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, + "grad_norm": 0.09018860757350922, + "kl": 0.0005846023559570312, + "learning_rate": 5.802047781569966e-08, + "loss": 0.0718, + "num_tokens": 16982096.0, + "reward": 0.43701171875, + "reward_std": 0.20313873887062073, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.39013671875, - "rewards/tag_count_reward/std": 0.19725459814071655, + "rewards/tag_count_reward/mean": 0.36669921875, + "rewards/tag_count_reward/std": 0.16969364881515503, "step": 18 }, { @@ -537,27 +537,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.56640625, + "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1668.79296875, - "completions/mean_terminated_length": 1173.4324951171875, - "completions/min_length": 250.0, - "completions/min_terminated_length": 250.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1758.98046875, + "completions/mean_terminated_length": 1243.771728515625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.006486301954425194, - "grad_norm": 0.09239482879638672, - "kl": 0.0009212493896484375, - "learning_rate": 6.164383561643836e-08, - "loss": 0.0917, - "num_tokens": 17658442.0, - "reward": 0.4638671875, - "reward_std": 0.1933201402425766, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, + "grad_norm": 0.08947055041790009, + "kl": 0.0005474090576171875, + "learning_rate": 6.14334470989761e-08, + "loss": 0.0693, + "num_tokens": 17960038.0, + "reward": 0.44189453125, + "reward_std": 0.1830744743347168, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.3974609375, - "rewards/tag_count_reward/std": 0.1863318681716919, + "rewards/tag_count_reward/mean": 0.37353515625, + "rewards/tag_count_reward/std": 0.17345291376113892, "step": 19 }, { @@ -566,27 +566,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.529296875, + "completions/clipped_ratio": 0.611328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 1626.255859375, - "completions/mean_terminated_length": 1152.012451171875, - "completions/min_length": 18.0, - "completions/min_terminated_length": 18.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1711.236328125, + "completions/mean_terminated_length": 1181.552734375, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, "epoch": 0.006827686267815994, - "grad_norm": 0.11028622090816498, - "kl": 0.0010242462158203125, - "learning_rate": 6.506849315068492e-08, - "loss": 0.1159, - "num_tokens": 18568301.0, - "reward": 0.521484375, - "reward_std": 0.2340538650751114, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, + "grad_norm": 0.11081380397081375, + "kl": 0.0005693435668945312, + "learning_rate": 6.484641638225255e-08, + "loss": 0.1089, + "num_tokens": 18913407.0, + "reward": 0.46630859375, + "reward_std": 0.196451336145401, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.4140625, - "rewards/tag_count_reward/std": 0.20196563005447388, + "rewards/tag_count_reward/mean": 0.37841796875, + "rewards/tag_count_reward/std": 0.16841623187065125, "step": 20 }, { @@ -595,27 +595,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.5078125, + "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1615.5078125, - "completions/mean_terminated_length": 1169.2857666015625, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1677.201171875, + "completions/mean_terminated_length": 1169.0694580078125, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, "epoch": 0.007169070581206793, - "grad_norm": 0.10006025433540344, - "kl": 0.0010442733764648438, - "learning_rate": 6.84931506849315e-08, - "loss": 0.0826, - "num_tokens": 19468801.0, - "reward": 0.5556640625, - "reward_std": 0.2516134977340698, - "rewards/accuracy_reward/mean": 0.140625, - "rewards/accuracy_reward/std": 0.3479743003845215, + "grad_norm": 0.09660843759775162, + "kl": 0.0005202293395996094, + "learning_rate": 6.8259385665529e-08, + "loss": 0.0893, + "num_tokens": 19845494.0, + "reward": 0.52490234375, + "reward_std": 0.26504456996917725, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.4150390625, - "rewards/tag_count_reward/std": 0.19436383247375488, + "rewards/tag_count_reward/mean": 0.38818359375, + "rewards/tag_count_reward/std": 0.1664702594280243, "step": 21 }, { @@ -624,27 +624,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.529296875, + "completions/clipped_ratio": 0.58984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1643.8984375, - "completions/mean_terminated_length": 1189.493896484375, - "completions/min_length": 190.0, - "completions/min_terminated_length": 190.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1712.4140625, + "completions/mean_terminated_length": 1229.8095703125, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, "epoch": 0.007510454894597594, - "grad_norm": 0.1029755026102066, - "kl": 0.0013523101806640625, - "learning_rate": 7.191780821917807e-08, - "loss": 0.0927, - "num_tokens": 20393389.0, - "reward": 0.537109375, - "reward_std": 0.2739160656929016, - "rewards/accuracy_reward/mean": 0.12890625, - "rewards/accuracy_reward/std": 0.33542385697364807, + "grad_norm": 0.08789636939764023, + "kl": 0.0005578994750976562, + "learning_rate": 7.167235494880546e-08, + "loss": 0.0668, + "num_tokens": 20805162.0, + "reward": 0.49267578125, + "reward_std": 0.23429395258426666, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.408203125, - "rewards/tag_count_reward/std": 0.19122402369976044, + "rewards/tag_count_reward/mean": 0.37353515625, + "rewards/tag_count_reward/std": 0.16253195703029633, "step": 22 }, { @@ -653,27 +653,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.578125, + "completions/clipped_ratio": 0.638671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 1683.533203125, - "completions/mean_terminated_length": 1184.0787353515625, - "completions/min_length": 335.0, - "completions/min_terminated_length": 335.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1752.947265625, + "completions/mean_terminated_length": 1231.421630859375, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, "epoch": 0.007851839207988393, - "grad_norm": 0.10230695456266403, - "kl": 0.0011806488037109375, - "learning_rate": 7.534246575342466e-08, - "loss": 0.1142, - "num_tokens": 21327022.0, - "reward": 0.486328125, - "reward_std": 0.27032989263534546, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, + "grad_norm": 0.09756770730018616, + "kl": 0.0005750656127929688, + "learning_rate": 7.508532423208192e-08, + "loss": 0.1014, + "num_tokens": 21774335.0, + "reward": 0.4443359375, + "reward_std": 0.23254692554473877, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.412109375, - "rewards/tag_count_reward/std": 0.21004575490951538, + "rewards/tag_count_reward/mean": 0.3779296875, + "rewards/tag_count_reward/std": 0.17830252647399902, "step": 23 }, { @@ -682,27 +682,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.478515625, + "completions/clipped_ratio": 0.5625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 1499.361328125, - "completions/mean_terminated_length": 995.9288330078125, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1629.5703125, + "completions/mean_terminated_length": 1091.58935546875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, "epoch": 0.008193223521379193, - "grad_norm": 0.11834343522787094, - "kl": 0.0013599395751953125, - "learning_rate": 7.876712328767122e-08, - "loss": 0.129, - "num_tokens": 22174311.0, - "reward": 0.49951171875, - "reward_std": 0.22222651541233063, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, + "grad_norm": 0.09879060089588165, + "kl": 0.0005731582641601562, + "learning_rate": 7.849829351535836e-08, + "loss": 0.0681, + "num_tokens": 22688291.0, + "reward": 0.43310546875, + "reward_std": 0.14599718153476715, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.44287109375, - "rewards/tag_count_reward/std": 0.2110753357410431, + "rewards/tag_count_reward/mean": 0.38232421875, + "rewards/tag_count_reward/std": 0.15701180696487427, "step": 24 }, { @@ -711,27 +711,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.51171875, + "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1673.552734375, - "completions/mean_terminated_length": 1281.132080078125, - "completions/min_length": 299.0, - "completions/min_terminated_length": 299.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1763.1328125, + "completions/mean_terminated_length": 1288.354248046875, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, "epoch": 0.008534607834769992, - "grad_norm": 0.09323837608098984, - "kl": 0.0014896392822265625, - "learning_rate": 8.21917808219178e-08, - "loss": 0.0923, - "num_tokens": 23108274.0, - "reward": 0.470703125, - "reward_std": 0.22797748446464539, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, + "grad_norm": 0.08964110165834427, + "kl": 0.0005235671997070312, + "learning_rate": 8.19112627986348e-08, + "loss": 0.0905, + "num_tokens": 23668119.0, + "reward": 0.41455078125, + "reward_std": 0.18736442923545837, + "rewards/accuracy_reward/mean": 0.041015625, + "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.427734375, - "rewards/tag_count_reward/std": 0.20607776939868927, + "rewards/tag_count_reward/mean": 0.37353515625, + "rewards/tag_count_reward/std": 0.16101987659931183, "step": 25 }, { @@ -740,27 +740,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.416015625, + "completions/clipped_ratio": 0.5078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1454.22265625, - "completions/mean_terminated_length": 1031.230712890625, - "completions/min_length": 106.0, - "completions/min_terminated_length": 106.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1573.40234375, + "completions/mean_terminated_length": 1083.7381591796875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, "epoch": 0.008875992148160792, - "grad_norm": 0.11998679488897324, - "kl": 0.0018024444580078125, - "learning_rate": 8.561643835616438e-08, - "loss": 0.0999, - "num_tokens": 23931380.0, - "reward": 0.64453125, - "reward_std": 0.3692547380924225, - "rewards/accuracy_reward/mean": 0.19153225421905518, - "rewards/accuracy_reward/std": 0.3939041793346405, + "grad_norm": 0.1042899489402771, + "kl": 0.0005631446838378906, + "learning_rate": 8.532423208191126e-08, + "loss": 0.0745, + "num_tokens": 24552245.0, + "reward": 0.54150390625, + "reward_std": 0.2826315462589264, + "rewards/accuracy_reward/mean": 0.13306452333927155, + "rewards/accuracy_reward/std": 0.3399873375892639, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.458984375, - "rewards/tag_count_reward/std": 0.2099001258611679, + "rewards/tag_count_reward/mean": 0.41259765625, + "rewards/tag_count_reward/std": 0.17570793628692627, "step": 26 }, { @@ -769,27 +769,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.443359375, + "completions/clipped_ratio": 0.52734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 1422.546875, - "completions/mean_terminated_length": 924.3789672851562, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1535.490234375, + "completions/mean_terminated_length": 963.6817626953125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.009217376461551591, - "grad_norm": 0.12516747415065765, - "kl": 0.0019092559814453125, - "learning_rate": 8.904109589041094e-08, - "loss": 0.1022, - "num_tokens": 24729004.0, - "reward": 0.51806640625, - "reward_std": 0.24743953347206116, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, + "grad_norm": 0.11625460535287857, + "kl": 0.0005464553833007812, + "learning_rate": 8.873720136518772e-08, + "loss": 0.1045, + "num_tokens": 25407696.0, + "reward": 0.4208984375, + "reward_std": 0.16960611939430237, + "rewards/accuracy_reward/mean": 0.0234375, + "rewards/accuracy_reward/std": 0.15143637359142303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.46337890625, - "rewards/tag_count_reward/std": 0.2377152144908905, + "rewards/tag_count_reward/mean": 0.3974609375, + "rewards/tag_count_reward/std": 0.1727055311203003, "step": 27 }, { @@ -798,27 +798,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.517578125, + "completions/clipped_ratio": 0.630859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1586.744140625, - "completions/mean_terminated_length": 1091.87451171875, - "completions/min_length": 14.0, - "completions/min_terminated_length": 14.0, + "completions/mean_length": 1699.21875, + "completions/mean_terminated_length": 1103.1534423828125, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, "epoch": 0.009558760774942391, - "grad_norm": 0.11038284003734589, - "kl": 0.002349853515625, - "learning_rate": 9.246575342465753e-08, - "loss": 0.1299, - "num_tokens": 25630505.0, - "reward": 0.50927734375, - "reward_std": 0.25093698501586914, - "rewards/accuracy_reward/mean": 0.058467742055654526, - "rewards/accuracy_reward/std": 0.23486268520355225, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.45068359375, - "rewards/tag_count_reward/std": 0.2440776377916336, + "grad_norm": 0.09857682883739471, + "kl": 0.0006265640258789062, + "learning_rate": 9.215017064846416e-08, + "loss": 0.0872, + "num_tokens": 26366784.0, + "reward": 0.41064453125, + "reward_std": 0.1888647824525833, + "rewards/accuracy_reward/mean": 0.04032257944345474, + "rewards/accuracy_reward/std": 0.19691328704357147, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.37158203125, + "rewards/tag_count_reward/std": 0.1776064932346344, "step": 28 }, { @@ -827,27 +827,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.486328125, + "completions/clipped_ratio": 0.65234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 1575.810546875, - "completions/mean_terminated_length": 1128.756591796875, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1750.69140625, + "completions/mean_terminated_length": 1192.8201904296875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.00990014508833319, - "grad_norm": 0.11193142831325531, - "kl": 0.001926422119140625, - "learning_rate": 9.58904109589041e-08, - "loss": 0.1319, - "num_tokens": 26512584.0, - "reward": 0.48291015625, - "reward_std": 0.23605626821517944, - "rewards/accuracy_reward/mean": 0.04435483738780022, - "rewards/accuracy_reward/std": 0.2060900777578354, + "grad_norm": 0.09634055942296982, + "kl": 0.0005664825439453125, + "learning_rate": 9.556313993174062e-08, + "loss": 0.0732, + "num_tokens": 27338402.0, + "reward": 0.41015625, + "reward_std": 0.1702592521905899, + "rewards/accuracy_reward/mean": 0.052419353276491165, + "rewards/accuracy_reward/std": 0.22309617698192596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.43994140625, - "rewards/tag_count_reward/std": 0.2148621678352356, + "rewards/tag_count_reward/mean": 0.359375, + "rewards/tag_count_reward/std": 0.15244260430335999, "step": 29 }, { @@ -856,27 +856,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.462890625, + "completions/clipped_ratio": 0.615234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1476.640625, - "completions/mean_terminated_length": 984.2327270507812, - "completions/min_length": 232.0, - "completions/min_terminated_length": 232.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1678.625, + "completions/mean_terminated_length": 1088.0, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, "epoch": 0.01024152940172399, - "grad_norm": 0.11076265573501587, - "kl": 0.002689361572265625, - "learning_rate": 9.931506849315068e-08, - "loss": 0.1011, - "num_tokens": 27348592.0, - "reward": 0.60107421875, - "reward_std": 0.27622419595718384, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, + "grad_norm": 0.09100901335477829, + "kl": 0.000553131103515625, + "learning_rate": 9.897610921501706e-08, + "loss": 0.0927, + "num_tokens": 28277826.0, + "reward": 0.46923828125, + "reward_std": 0.15819703042507172, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.46826171875, - "rewards/tag_count_reward/std": 0.23790405690670013, + "rewards/tag_count_reward/mean": 0.36767578125, + "rewards/tag_count_reward/std": 0.1623731404542923, "step": 30 }, { @@ -885,27 +885,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.443359375, + "completions/clipped_ratio": 0.5390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 1510.275390625, - "completions/mean_terminated_length": 1081.982421875, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1662.60546875, + "completions/mean_terminated_length": 1211.889892578125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.01058291371511479, - "grad_norm": 0.11147204041481018, - "kl": 0.002384185791015625, - "learning_rate": 1.0273972602739725e-07, - "loss": 0.1056, - "num_tokens": 28200637.0, - "reward": 0.58056640625, - "reward_std": 0.2617027759552002, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, + "grad_norm": 0.10911896824836731, + "kl": 0.0005741119384765625, + "learning_rate": 1.023890784982935e-07, + "loss": 0.0859, + "num_tokens": 29207864.0, + "reward": 0.47998046875, + "reward_std": 0.21130922436714172, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.46728515625, - "rewards/tag_count_reward/std": 0.23151643574237823, + "rewards/tag_count_reward/mean": 0.39208984375, + "rewards/tag_count_reward/std": 0.16975557804107666, "step": 31 }, { @@ -914,27 +914,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.34765625, + "completions/clipped_ratio": 0.55859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1437.0859375, - "completions/mean_terminated_length": 1111.509033203125, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1712.2265625, + "completions/mean_terminated_length": 1287.3096923828125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.010924298028505591, - "grad_norm": 0.11107554286718369, - "kl": 0.002471923828125, - "learning_rate": 1.0616438356164383e-07, - "loss": 0.1385, - "num_tokens": 29013849.0, - "reward": 0.61376953125, - "reward_std": 0.3043445944786072, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, + "grad_norm": 0.09444137662649155, + "kl": 0.0005474090576171875, + "learning_rate": 1.0580204778156996e-07, + "loss": 0.089, + "num_tokens": 30161948.0, + "reward": 0.45654296875, + "reward_std": 0.22040751576423645, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.49072265625, - "rewards/tag_count_reward/std": 0.22182057797908783, + "rewards/tag_count_reward/mean": 0.38623046875, + "rewards/tag_count_reward/std": 0.15202754735946655, "step": 32 }, { @@ -943,27 +943,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.3984375, + "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1431.798828125, - "completions/mean_terminated_length": 1023.6655883789062, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1680.62109375, + "completions/mean_terminated_length": 1025.728271484375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, "epoch": 0.01126568234189639, - "grad_norm": 0.12299497425556183, - "kl": 0.002918243408203125, - "learning_rate": 1.095890410958904e-07, - "loss": 0.1306, - "num_tokens": 29831666.0, - "reward": 0.5302734375, - "reward_std": 0.26406511664390564, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, + "grad_norm": 0.10579922050237656, + "kl": 0.00057220458984375, + "learning_rate": 1.0921501706484642e-07, + "loss": 0.0772, + "num_tokens": 31107162.0, + "reward": 0.40966796875, + "reward_std": 0.16281485557556152, + "rewards/accuracy_reward/mean": 0.041015625, + "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.4833984375, - "rewards/tag_count_reward/std": 0.24020493030548096, + "rewards/tag_count_reward/mean": 0.36865234375, + "rewards/tag_count_reward/std": 0.15548905730247498, "step": 33 }, { @@ -972,27 +972,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.3203125, + "completions/clipped_ratio": 0.572265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1359.208984375, - "completions/mean_terminated_length": 1034.6063232421875, - "completions/min_length": 174.0, - "completions/min_terminated_length": 174.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1684.06640625, + "completions/mean_terminated_length": 1197.1597900390625, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, "epoch": 0.01160706665528719, - "grad_norm": 0.1284693032503128, - "kl": 0.003650665283203125, - "learning_rate": 1.1301369863013698e-07, - "loss": 0.1532, - "num_tokens": 30600861.0, - "reward": 0.64892578125, - "reward_std": 0.31966668367385864, - "rewards/accuracy_reward/mean": 0.12890625, - "rewards/accuracy_reward/std": 0.33542385697364807, + "grad_norm": 0.10974515974521637, + "kl": 0.0005903244018554688, + "learning_rate": 1.1262798634812286e-07, + "loss": 0.1093, + "num_tokens": 32042684.0, + "reward": 0.46484375, + "reward_std": 0.20326673984527588, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.52001953125, - "rewards/tag_count_reward/std": 0.23763883113861084, + "rewards/tag_count_reward/mean": 0.388671875, + "rewards/tag_count_reward/std": 0.1619614213705063, "step": 34 }, { @@ -1001,27 +1001,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.37890625, + "completions/clipped_ratio": 0.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, - "completions/mean_length": 1410.498046875, - "completions/mean_terminated_length": 1021.5817260742188, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/mean_length": 1618.482421875, + "completions/mean_terminated_length": 1100.09912109375, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, "epoch": 0.01194845096867799, - "grad_norm": 0.11522830277681351, - "kl": 0.0032196044921875, - "learning_rate": 1.1643835616438355e-07, - "loss": 0.1269, - "num_tokens": 31405052.0, - "reward": 0.67529296875, - "reward_std": 0.335245817899704, - "rewards/accuracy_reward/mean": 0.169921875, - "rewards/accuracy_reward/std": 0.3759314715862274, + "grad_norm": 0.09662748873233795, + "kl": 0.0005207061767578125, + "learning_rate": 1.1604095563139932e-07, + "loss": 0.0735, + "num_tokens": 32953363.0, + "reward": 0.5390625, + "reward_std": 0.23657390475273132, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.50537109375, - "rewards/tag_count_reward/std": 0.24846979975700378, + "rewards/tag_count_reward/mean": 0.390625, + "rewards/tag_count_reward/std": 0.16403664648532867, "step": 35 }, { @@ -1030,27 +1030,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.392578125, + "completions/clipped_ratio": 0.619140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 1420.86328125, - "completions/mean_terminated_length": 1015.5433959960938, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1717.830078125, + "completions/mean_terminated_length": 1181.09228515625, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, "epoch": 0.01228983528206879, - "grad_norm": 0.12051548808813095, - "kl": 0.003536224365234375, - "learning_rate": 1.1986301369863011e-07, - "loss": 0.1341, - "num_tokens": 32216230.0, - "reward": 0.5908203125, - "reward_std": 0.3006790280342102, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, + "grad_norm": 0.10695859789848328, + "kl": 0.0006036758422851562, + "learning_rate": 1.1945392491467578e-07, + "loss": 0.1211, + "num_tokens": 33916588.0, + "reward": 0.431640625, + "reward_std": 0.20863476395606995, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.5224609375, - "rewards/tag_count_reward/std": 0.26768815517425537, + "rewards/tag_count_reward/mean": 0.376953125, + "rewards/tag_count_reward/std": 0.17968250811100006, "step": 36 }, { @@ -1059,27 +1059,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.404296875, + "completions/clipped_ratio": 0.62109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1517.08203125, - "completions/mean_terminated_length": 1156.754150390625, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1744.015625, + "completions/mean_terminated_length": 1245.73193359375, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, "epoch": 0.012631219595459589, - "grad_norm": 0.10653501749038696, - "kl": 0.00284576416015625, - "learning_rate": 1.232876712328767e-07, - "loss": 0.1508, - "num_tokens": 33067920.0, - "reward": 0.5849609375, - "reward_std": 0.3052826523780823, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.5087890625, - "rewards/tag_count_reward/std": 0.257796049118042, + "grad_norm": 0.0918794795870781, + "kl": 0.0005393028259277344, + "learning_rate": 1.228668941979522e-07, + "loss": 0.0843, + "num_tokens": 34884468.0, + "reward": 0.44189453125, + "reward_std": 0.2063564956188202, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.37353515625, + "rewards/tag_count_reward/std": 0.16403010487556458, "step": 37 }, { @@ -1088,27 +1088,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.392578125, + "completions/clipped_ratio": 0.595703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 1439.75390625, - "completions/mean_terminated_length": 1046.64306640625, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1704.533203125, + "completions/mean_terminated_length": 1198.4588623046875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, "epoch": 0.012972603908850388, - "grad_norm": 0.12568874657154083, - "kl": 0.00372314453125, - "learning_rate": 1.2671232876712328e-07, - "loss": 0.1542, - "num_tokens": 33885794.0, - "reward": 0.57177734375, - "reward_std": 0.2767326831817627, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, + "grad_norm": 0.10510947555303574, + "kl": 0.0005712509155273438, + "learning_rate": 1.2627986348122866e-07, + "loss": 0.0715, + "num_tokens": 35837909.0, + "reward": 0.40380859375, + "reward_std": 0.15139544010162354, + "rewards/accuracy_reward/mean": 0.021484375, + "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.51708984375, - "rewards/tag_count_reward/std": 0.2599788010120392, + "rewards/tag_count_reward/mean": 0.38232421875, + "rewards/tag_count_reward/std": 0.16387273371219635, "step": 38 }, { @@ -1117,27 +1117,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.35546875, + "completions/clipped_ratio": 0.619140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1402.21875, - "completions/mean_terminated_length": 1046.060546875, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1736.8515625, + "completions/mean_terminated_length": 1231.035888671875, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, "epoch": 0.013313988222241188, - "grad_norm": 0.12105520069599152, - "kl": 0.00417327880859375, - "learning_rate": 1.3013698630136985e-07, - "loss": 0.1481, - "num_tokens": 34681634.0, - "reward": 0.6513671875, - "reward_std": 0.31875500082969666, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, + "grad_norm": 0.09516466408967972, + "kl": 0.0005741119384765625, + "learning_rate": 1.296928327645051e-07, + "loss": 0.0776, + "num_tokens": 36805081.0, + "reward": 0.45263671875, + "reward_std": 0.21325279772281647, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.5537109375, - "rewards/tag_count_reward/std": 0.27545711398124695, + "rewards/tag_count_reward/mean": 0.37646484375, + "rewards/tag_count_reward/std": 0.17832061648368835, "step": 39 }, { @@ -1146,27 +1146,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.5, + "completions/clipped_ratio": 0.69921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1559.27734375, - "completions/mean_terminated_length": 1070.5546875, - "completions/min_length": 186.0, - "completions/min_terminated_length": 186.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1816.265625, + "completions/mean_terminated_length": 1277.5584716796875, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, "epoch": 0.013655372535631987, - "grad_norm": 0.10639850795269012, - "kl": 0.003894805908203125, - "learning_rate": 1.3356164383561644e-07, - "loss": 0.1527, - "num_tokens": 35560576.0, - "reward": 0.51171875, - "reward_std": 0.27677735686302185, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, + "grad_norm": 0.08078555017709732, + "kl": 0.0005369186401367188, + "learning_rate": 1.3310580204778158e-07, + "loss": 0.0623, + "num_tokens": 37815601.0, + "reward": 0.40087890625, + "reward_std": 0.15098059177398682, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.46875, - "rewards/tag_count_reward/std": 0.2612442076206207, + "rewards/tag_count_reward/mean": 0.34033203125, + "rewards/tag_count_reward/std": 0.1318589299917221, "step": 40 }, { @@ -1175,27 +1175,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.390625, + "completions/clipped_ratio": 0.5859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1423.3515625, - "completions/mean_terminated_length": 1022.9359130859375, - "completions/min_length": 193.0, - "completions/min_terminated_length": 193.0, + "completions/mean_length": 1700.818359375, + "completions/mean_terminated_length": 1209.5235595703125, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, "epoch": 0.013996756849022787, - "grad_norm": 0.117813341319561, - "kl": 0.00411224365234375, - "learning_rate": 1.36986301369863e-07, - "loss": 0.1249, - "num_tokens": 36360196.0, - "reward": 0.6103515625, - "reward_std": 0.2959108352661133, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, + "grad_norm": 0.10072726011276245, + "kl": 0.0005693435668945312, + "learning_rate": 1.36518771331058e-07, + "loss": 0.1016, + "num_tokens": 38757284.0, + "reward": 0.44970703125, + "reward_std": 0.22127588093280792, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.5380859375, - "rewards/tag_count_reward/std": 0.27805349230766296, + "rewards/tag_count_reward/mean": 0.38330078125, + "rewards/tag_count_reward/std": 0.17041288316249847, "step": 41 }, { @@ -1204,27 +1204,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.423828125, + "completions/clipped_ratio": 0.6796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1436.888671875, - "completions/mean_terminated_length": 987.3593139648438, - "completions/min_length": 40.0, - "completions/min_terminated_length": 40.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1764.595703125, + "completions/mean_terminated_length": 1163.2255859375, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, "epoch": 0.014338141162413586, - "grad_norm": 0.13158966600894928, - "kl": 0.0052337646484375, - "learning_rate": 1.4041095890410958e-07, - "loss": 0.1634, - "num_tokens": 37181243.0, - "reward": 0.6552734375, - "reward_std": 0.3702911138534546, - "rewards/accuracy_reward/mean": 0.12109375, - "rewards/accuracy_reward/std": 0.3265552520751953, + "grad_norm": 0.09533902257680893, + "kl": 0.000606536865234375, + "learning_rate": 1.3993174061433446e-07, + "loss": 0.076, + "num_tokens": 39746117.0, + "reward": 0.41259765625, + "reward_std": 0.17641165852546692, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.5341796875, - "rewards/tag_count_reward/std": 0.2790003716945648, + "rewards/tag_count_reward/mean": 0.35595703125, + "rewards/tag_count_reward/std": 0.16065748035907745, "step": 42 }, { @@ -1233,27 +1233,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.416015625, + "completions/clipped_ratio": 0.64453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1464.73046875, - "completions/mean_terminated_length": 1049.22412109375, - "completions/min_length": 17.0, - "completions/min_terminated_length": 17.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1757.072265625, + "completions/mean_terminated_length": 1229.56591796875, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, "epoch": 0.014679525475804386, - "grad_norm": 0.11569352447986603, - "kl": 0.00490570068359375, - "learning_rate": 1.4383561643835615e-07, - "loss": 0.1364, - "num_tokens": 38004785.0, - "reward": 0.65087890625, - "reward_std": 0.3135306239128113, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, + "grad_norm": 0.09078851342201233, + "kl": 0.0004987716674804688, + "learning_rate": 1.4334470989761092e-07, + "loss": 0.0848, + "num_tokens": 40719338.0, + "reward": 0.4482421875, + "reward_std": 0.1925346851348877, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.53759765625, - "rewards/tag_count_reward/std": 0.26940807700157166, + "rewards/tag_count_reward/mean": 0.3701171875, + "rewards/tag_count_reward/std": 0.17268340289592743, "step": 43 }, { @@ -1262,27 +1262,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.318359375, + "completions/clipped_ratio": 0.568359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1345.845703125, - "completions/mean_terminated_length": 1017.9054565429688, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1692.580078125, + "completions/mean_terminated_length": 1224.583740234375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.015020909789195187, - "grad_norm": 0.11853042989969254, - "kl": 0.0054168701171875, - "learning_rate": 1.4726027397260274e-07, - "loss": 0.1455, - "num_tokens": 38764066.0, - "reward": 0.72412109375, - "reward_std": 0.38516783714294434, - "rewards/accuracy_reward/mean": 0.162109375, - "rewards/accuracy_reward/std": 0.3689115643501282, + "grad_norm": 0.10198905318975449, + "kl": 0.0005292892456054688, + "learning_rate": 1.4675767918088735e-07, + "loss": 0.0863, + "num_tokens": 41656147.0, + "reward": 0.48876953125, + "reward_std": 0.23672613501548767, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.56201171875, - "rewards/tag_count_reward/std": 0.27570804953575134, + "rewards/tag_count_reward/mean": 0.38720703125, + "rewards/tag_count_reward/std": 0.14951784908771515, "step": 44 }, { @@ -1291,27 +1291,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.28125, + "completions/clipped_ratio": 0.57421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 1304.82421875, - "completions/mean_terminated_length": 1014.0162963867188, - "completions/min_length": 41.0, - "completions/min_terminated_length": 41.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1717.435546875, + "completions/mean_terminated_length": 1271.62841796875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, "epoch": 0.015362294102585987, - "grad_norm": 0.11778295040130615, - "kl": 0.00571441650390625, - "learning_rate": 1.506849315068493e-07, - "loss": 0.1641, - "num_tokens": 39513896.0, - "reward": 0.6630859375, - "reward_std": 0.2909233570098877, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, + "grad_norm": 0.08921471983194351, + "kl": 0.0005221366882324219, + "learning_rate": 1.5017064846416383e-07, + "loss": 0.0709, + "num_tokens": 42617234.0, + "reward": 0.4248046875, + "reward_std": 0.14722687005996704, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.6005859375, - "rewards/tag_count_reward/std": 0.274289071559906, + "rewards/tag_count_reward/mean": 0.3857421875, + "rewards/tag_count_reward/std": 0.1731255054473877, "step": 45 }, { @@ -1320,27 +1320,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.34765625, + "completions/clipped_ratio": 0.66015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1399.044921875, - "completions/mean_terminated_length": 1053.1947021484375, - "completions/min_length": 187.0, - "completions/min_terminated_length": 187.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1792.638671875, + "completions/mean_terminated_length": 1296.5919189453125, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, "epoch": 0.015703678415976786, - "grad_norm": 0.12609730660915375, - "kl": 0.0064697265625, - "learning_rate": 1.5410958904109588e-07, - "loss": 0.1506, - "num_tokens": 40304655.0, - "reward": 0.64892578125, - "reward_std": 0.3167800009250641, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, + "grad_norm": 0.08767879009246826, + "kl": 0.0005807876586914062, + "learning_rate": 1.5358361774744026e-07, + "loss": 0.0817, + "num_tokens": 43609513.0, + "reward": 0.4345703125, + "reward_std": 0.16691412031650543, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.55126953125, - "rewards/tag_count_reward/std": 0.26111936569213867, + "rewards/tag_count_reward/mean": 0.3583984375, + "rewards/tag_count_reward/std": 0.15472902357578278, "step": 46 }, { @@ -1349,27 +1349,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.287109375, + "completions/clipped_ratio": 0.595703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 1281.595703125, - "completions/mean_terminated_length": 972.9342651367188, - "completions/min_length": 74.0, - "completions/min_terminated_length": 74.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1705.353515625, + "completions/mean_terminated_length": 1200.4879150390625, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, "epoch": 0.016045062729367586, - "grad_norm": 0.1254846602678299, - "kl": 0.00629425048828125, - "learning_rate": 1.5753424657534245e-07, - "loss": 0.1563, - "num_tokens": 41036672.0, - "reward": 0.69189453125, - "reward_std": 0.2690476179122925, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, + "grad_norm": 0.09582287073135376, + "kl": 0.000545501708984375, + "learning_rate": 1.5699658703071672e-07, + "loss": 0.065, + "num_tokens": 44558494.0, + "reward": 0.4609375, + "reward_std": 0.1579788625240326, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.61767578125, - "rewards/tag_count_reward/std": 0.28142958879470825, + "rewards/tag_count_reward/mean": 0.3984375, + "rewards/tag_count_reward/std": 0.18817149102687836, "step": 47 }, { @@ -1378,27 +1378,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.298828125, + "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1328.29296875, - "completions/mean_terminated_length": 1021.5654296875, - "completions/min_length": 90.0, - "completions/min_terminated_length": 90.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1772.978515625, + "completions/mean_terminated_length": 1247.9375, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, "epoch": 0.016386447042758386, - "grad_norm": 0.1133851557970047, - "kl": 0.0064239501953125, - "learning_rate": 1.6095890410958904e-07, - "loss": 0.1806, - "num_tokens": 41797462.0, - "reward": 0.67578125, - "reward_std": 0.3162664771080017, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, + "grad_norm": 0.09797421842813492, + "kl": 0.0005407333374023438, + "learning_rate": 1.6040955631399318e-07, + "loss": 0.1087, + "num_tokens": 45546963.0, + "reward": 0.40185546875, + "reward_std": 0.16916505992412567, + "rewards/accuracy_reward/mean": 0.044921875, + "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.611328125, - "rewards/tag_count_reward/std": 0.2759232819080353, + "rewards/tag_count_reward/mean": 0.35693359375, + "rewards/tag_count_reward/std": 0.1456000804901123, "step": 48 }, { @@ -1407,27 +1407,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.29296875, + "completions/clipped_ratio": 0.62890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1305.4296875, - "completions/mean_terminated_length": 997.73486328125, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1719.7109375, + "completions/mean_terminated_length": 1163.347412109375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, "epoch": 0.016727831356149185, - "grad_norm": 0.14507456123828888, - "kl": 0.0088348388671875, - "learning_rate": 1.643835616438356e-07, - "loss": 0.1383, - "num_tokens": 42539234.0, - "reward": 0.70068359375, - "reward_std": 0.3099219799041748, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, + "grad_norm": 0.10793397575616837, + "kl": 0.0005192756652832031, + "learning_rate": 1.638225255972696e-07, + "loss": 0.0715, + "num_tokens": 46500847.0, + "reward": 0.4375, + "reward_std": 0.1823108047246933, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.61865234375, - "rewards/tag_count_reward/std": 0.2870470881462097, + "rewards/tag_count_reward/mean": 0.369140625, + "rewards/tag_count_reward/std": 0.1593923270702362, "step": 49 }, { @@ -1436,27 +1436,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.24609375, + "completions/clipped_ratio": 0.552734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1211.787109375, - "completions/mean_terminated_length": 938.826416015625, - "completions/min_length": 37.0, - "completions/min_terminated_length": 37.0, + "completions/mean_length": 1670.041015625, + "completions/mean_terminated_length": 1202.956298828125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, "epoch": 0.017069215669539985, - "grad_norm": 0.160403773188591, - "kl": 0.010650634765625, - "learning_rate": 1.6780821917808218e-07, - "loss": 0.1423, - "num_tokens": 43227733.0, - "reward": 0.76904296875, - "reward_std": 0.36772122979164124, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, + "grad_norm": 0.09364376217126846, + "kl": 0.0005340576171875, + "learning_rate": 1.6723549488054606e-07, + "loss": 0.0824, + "num_tokens": 47423972.0, + "reward": 0.490234375, + "reward_std": 0.19551226496696472, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.64208984375, - "rewards/tag_count_reward/std": 0.2827405035495758, + "rewards/tag_count_reward/mean": 0.380859375, + "rewards/tag_count_reward/std": 0.15152467787265778, "step": 50 }, { @@ -1465,27 +1465,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.26953125, + "completions/clipped_ratio": 0.576171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 1242.466796875, - "completions/mean_terminated_length": 945.2379760742188, - "completions/min_length": 19.0, - "completions/min_terminated_length": 19.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1711.96484375, + "completions/mean_terminated_length": 1255.142822265625, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, "epoch": 0.017410599982930784, - "grad_norm": 0.13634954392910004, - "kl": 0.0076904296875, - "learning_rate": 1.7123287671232875e-07, - "loss": 0.1621, - "num_tokens": 43943108.0, - "reward": 0.79931640625, - "reward_std": 0.36965513229370117, - "rewards/accuracy_reward/mean": 0.1640625, - "rewards/accuracy_reward/std": 0.37069445848464966, + "grad_norm": 0.09366326779127121, + "kl": 0.00054931640625, + "learning_rate": 1.7064846416382252e-07, + "loss": 0.0825, + "num_tokens": 48379730.0, + "reward": 0.51318359375, + "reward_std": 0.24613450467586517, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.63525390625, - "rewards/tag_count_reward/std": 0.28863388299942017, + "rewards/tag_count_reward/mean": 0.38623046875, + "rewards/tag_count_reward/std": 0.17238640785217285, "step": 51 }, { @@ -1494,27 +1494,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.232421875, + "completions/clipped_ratio": 0.61328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1197.734375, - "completions/mean_terminated_length": 940.2748413085938, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1696.419921875, + "completions/mean_terminated_length": 1138.8636474609375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, "epoch": 0.017751984296321584, - "grad_norm": 0.134590283036232, - "kl": 0.0091094970703125, - "learning_rate": 1.7465753424657535e-07, - "loss": 0.1779, - "num_tokens": 44634668.0, - "reward": 0.69775390625, - "reward_std": 0.2980614900588989, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, + "grad_norm": 0.11035492271184921, + "kl": 0.0005512237548828125, + "learning_rate": 1.7406143344709898e-07, + "loss": 0.0892, + "num_tokens": 49326617.0, + "reward": 0.419921875, + "reward_std": 0.18128812313079834, + "rewards/accuracy_reward/mean": 0.037109375, + "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.63330078125, - "rewards/tag_count_reward/std": 0.27031806111335754, + "rewards/tag_count_reward/mean": 0.3828125, + "rewards/tag_count_reward/std": 0.17186526954174042, "step": 52 }, { @@ -1523,27 +1523,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.271484375, + "completions/clipped_ratio": 0.619140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 1235.2265625, - "completions/mean_terminated_length": 932.3432006835938, - "completions/min_length": 24.0, - "completions/min_terminated_length": 24.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1714.14453125, + "completions/mean_terminated_length": 1171.4154052734375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.018093368609712383, - "grad_norm": 0.4131734073162079, - "kl": 0.011260986328125, - "learning_rate": 1.780821917808219e-07, - "loss": 0.1614, - "num_tokens": 45341264.0, - "reward": 0.759765625, - "reward_std": 0.35761427879333496, - "rewards/accuracy_reward/mean": 0.134765625, - "rewards/accuracy_reward/std": 0.3418070077896118, + "grad_norm": 0.10192305594682693, + "kl": 0.0005540847778320312, + "learning_rate": 1.7747440273037543e-07, + "loss": 0.0876, + "num_tokens": 50278419.0, + "reward": 0.45654296875, + "reward_std": 0.2064102292060852, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.625, - "rewards/tag_count_reward/std": 0.2789061665534973, + "rewards/tag_count_reward/mean": 0.37255859375, + "rewards/tag_count_reward/std": 0.15242300927639008, "step": 53 }, { @@ -1552,27 +1552,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.248046875, + "completions/clipped_ratio": 0.541015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 1147.494140625, - "completions/mean_terminated_length": 850.4441528320312, - "completions/min_length": 25.0, - "completions/min_terminated_length": 25.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1619.599609375, + "completions/mean_terminated_length": 1114.634033203125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, "epoch": 0.018434752923103183, - "grad_norm": 0.14847879111766815, - "kl": 0.0095977783203125, - "learning_rate": 1.8150684931506848e-07, - "loss": 0.1749, - "num_tokens": 46011789.0, - "reward": 0.76416015625, - "reward_std": 0.3492337465286255, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, + "grad_norm": 0.11204086244106293, + "kl": 0.0006103515625, + "learning_rate": 1.8088737201365186e-07, + "loss": 0.0994, + "num_tokens": 51190662.0, + "reward": 0.4794921875, + "reward_std": 0.23044265806674957, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.64697265625, - "rewards/tag_count_reward/std": 0.28541797399520874, + "rewards/tag_count_reward/mean": 0.4072265625, + "rewards/tag_count_reward/std": 0.1951880007982254, "step": 54 }, { @@ -1581,27 +1581,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.25, + "completions/clipped_ratio": 0.57421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1239.833984375, - "completions/mean_terminated_length": 970.4453125, - "completions/min_length": 86.0, - "completions/min_terminated_length": 86.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1720.3984375, + "completions/mean_terminated_length": 1278.587158203125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, "epoch": 0.018776137236493982, - "grad_norm": 0.12348626554012299, - "kl": 0.006805419921875, - "learning_rate": 1.8493150684931505e-07, - "loss": 0.1739, - "num_tokens": 46719448.0, - "reward": 0.73828125, - "reward_std": 0.35028383135795593, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, + "grad_norm": 0.09342999011278152, + "kl": 0.0005502700805664062, + "learning_rate": 1.8430034129692832e-07, + "loss": 0.1088, + "num_tokens": 52144370.0, + "reward": 0.45654296875, + "reward_std": 0.20446324348449707, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.630859375, - "rewards/tag_count_reward/std": 0.27752548456192017, + "rewards/tag_count_reward/mean": 0.39208984375, + "rewards/tag_count_reward/std": 0.16903352737426758, "step": 55 }, { @@ -1610,27 +1610,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.150390625, + "completions/clipped_ratio": 0.404296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1010.849609375, - "completions/mean_terminated_length": 827.2620849609375, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1505.333984375, + "completions/mean_terminated_length": 1137.0328369140625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, "epoch": 0.019117521549884782, - "grad_norm": 0.14299838244915009, - "kl": 0.00946044921875, - "learning_rate": 1.8835616438356165e-07, - "loss": 0.1366, - "num_tokens": 47314203.0, - "reward": 0.8544921875, - "reward_std": 0.3641713857650757, - "rewards/accuracy_reward/mean": 0.15234375, - "rewards/accuracy_reward/std": 0.35970520973205566, + "grad_norm": 0.11249049752950668, + "kl": 0.0005698204040527344, + "learning_rate": 1.8771331058020475e-07, + "loss": 0.1152, + "num_tokens": 52992301.0, + "reward": 0.54345703125, + "reward_std": 0.25522473454475403, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7021484375, - "rewards/tag_count_reward/std": 0.26891329884529114, + "rewards/tag_count_reward/mean": 0.43798828125, + "rewards/tag_count_reward/std": 0.18946638703346252, "step": 56 }, { @@ -1639,27 +1639,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.26171875, + "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 1245.9609375, - "completions/mean_terminated_length": 961.6401977539062, - "completions/min_length": 64.0, - "completions/min_terminated_length": 64.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1778.322265625, + "completions/mean_terminated_length": 1263.4830322265625, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, "epoch": 0.01945890586327558, - "grad_norm": 0.14915452897548676, - "kl": 0.00835418701171875, - "learning_rate": 1.917808219178082e-07, - "loss": 0.2204, - "num_tokens": 48030055.0, - "reward": 0.7587890625, - "reward_std": 0.3553071618080139, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, + "grad_norm": 0.09423235803842545, + "kl": 0.000568389892578125, + "learning_rate": 1.9112627986348124e-07, + "loss": 0.089, + "num_tokens": 53980722.0, + "reward": 0.40185546875, + "reward_std": 0.16317158937454224, + "rewards/accuracy_reward/mean": 0.041015625, + "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.6572265625, - "rewards/tag_count_reward/std": 0.2924967110157013, + "rewards/tag_count_reward/mean": 0.36083984375, + "rewards/tag_count_reward/std": 0.1525859236717224, "step": 57 }, { @@ -1668,27 +1668,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.314453125, + "completions/clipped_ratio": 0.67578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1291.556640625, - "completions/mean_terminated_length": 944.5840454101562, - "completions/min_length": 104.0, - "completions/min_terminated_length": 104.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1807.017578125, + "completions/mean_terminated_length": 1304.7288818359375, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, "epoch": 0.01980029017666638, - "grad_norm": 0.12248866260051727, - "kl": 0.00811004638671875, - "learning_rate": 1.9520547945205478e-07, - "loss": 0.2033, - "num_tokens": 48783668.0, - "reward": 0.697265625, - "reward_std": 0.3434180021286011, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, + "grad_norm": 0.09344030171632767, + "kl": 0.00055694580078125, + "learning_rate": 1.9453924914675767e-07, + "loss": 0.0798, + "num_tokens": 54998251.0, + "reward": 0.431640625, + "reward_std": 0.20107224583625793, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.607421875, - "rewards/tag_count_reward/std": 0.2916542887687683, + "rewards/tag_count_reward/mean": 0.35546875, + "rewards/tag_count_reward/std": 0.15752294659614563, "step": 58 }, { @@ -1697,27 +1697,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.28515625, + "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 1229.19140625, - "completions/mean_terminated_length": 902.5628051757812, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1739.4375, + "completions/mean_terminated_length": 1189.391357421875, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, "epoch": 0.02014167449005718, - "grad_norm": 0.12459442764520645, - "kl": 0.00914764404296875, - "learning_rate": 1.9863013698630135e-07, - "loss": 0.2028, - "num_tokens": 49481446.0, - "reward": 0.74169921875, - "reward_std": 0.37255799770355225, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, + "grad_norm": 0.0854397565126419, + "kl": 0.0005483627319335938, + "learning_rate": 1.9795221843003412e-07, + "loss": 0.0912, + "num_tokens": 55957275.0, + "reward": 0.45361328125, + "reward_std": 0.18842391669750214, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.64599609375, - "rewards/tag_count_reward/std": 0.2980671226978302, + "rewards/tag_count_reward/mean": 0.36181640625, + "rewards/tag_count_reward/std": 0.16275520622730255, "step": 59 }, { @@ -1726,27 +1726,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.275390625, + "completions/clipped_ratio": 0.564453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1194.38671875, - "completions/mean_terminated_length": 869.9676513671875, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1649.150390625, + "completions/mean_terminated_length": 1132.255615234375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, "epoch": 0.02048305880344798, - "grad_norm": 0.1351117342710495, - "kl": 0.0087432861328125, - "learning_rate": 2.0205479452054795e-07, - "loss": 0.1894, - "num_tokens": 50176396.0, - "reward": 0.73193359375, - "reward_std": 0.34207260608673096, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, + "grad_norm": 0.09843935072422028, + "kl": 0.0005526542663574219, + "learning_rate": 2.0136518771331058e-07, + "loss": 0.0842, + "num_tokens": 56885064.0, + "reward": 0.4384765625, + "reward_std": 0.19701939821243286, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.64404296875, - "rewards/tag_count_reward/std": 0.2973770797252655, + "rewards/tag_count_reward/mean": 0.3916015625, + "rewards/tag_count_reward/std": 0.1747734695672989, "step": 60 }, { @@ -1755,27 +1755,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.33203125, + "completions/clipped_ratio": 0.576171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1349.58984375, - "completions/mean_terminated_length": 1002.4268798828125, - "completions/min_length": 87.0, - "completions/min_terminated_length": 87.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1742.107421875, + "completions/mean_terminated_length": 1326.2626953125, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, "epoch": 0.02082444311683878, - "grad_norm": 0.11476152390241623, - "kl": 0.00765228271484375, - "learning_rate": 2.054794520547945e-07, - "loss": 0.1738, - "num_tokens": 50941514.0, - "reward": 0.73388671875, - "reward_std": 0.3434625267982483, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, + "grad_norm": 0.0872185230255127, + "kl": 0.0005540847778320312, + "learning_rate": 2.04778156996587e-07, + "loss": 0.0743, + "num_tokens": 57851151.0, + "reward": 0.48388671875, + "reward_std": 0.22081714868545532, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.60107421875, - "rewards/tag_count_reward/std": 0.2890937030315399, + "rewards/tag_count_reward/mean": 0.38232421875, + "rewards/tag_count_reward/std": 0.15856212377548218, "step": 61 }, { @@ -1784,27 +1784,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1953125, + "completions/clipped_ratio": 0.447265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1068.947265625, - "completions/mean_terminated_length": 831.3131103515625, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/mean_length": 1565.787109375, + "completions/mean_terminated_length": 1175.5865478515625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, "epoch": 0.02116582743022958, - "grad_norm": 0.14407098293304443, - "kl": 0.0102691650390625, - "learning_rate": 2.0890410958904109e-07, - "loss": 0.1951, - "num_tokens": 51562751.0, - "reward": 0.8681640625, - "reward_std": 0.33195409178733826, - "rewards/accuracy_reward/mean": 0.171875, - "rewards/accuracy_reward/std": 0.3776407241821289, + "grad_norm": 0.10896014422178268, + "kl": 0.0005750656127929688, + "learning_rate": 2.0819112627986347e-07, + "loss": 0.1048, + "num_tokens": 58726770.0, + "reward": 0.5595703125, + "reward_std": 0.2406485378742218, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.6962890625, - "rewards/tag_count_reward/std": 0.28029879927635193, + "rewards/tag_count_reward/mean": 0.4150390625, + "rewards/tag_count_reward/std": 0.16657714545726776, "step": 62 }, { @@ -1813,27 +1813,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.19140625, + "completions/clipped_ratio": 0.49609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 1083.150390625, - "completions/mean_terminated_length": 854.7560424804688, - "completions/min_length": 25.0, - "completions/min_terminated_length": 25.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1565.802734375, + "completions/mean_terminated_length": 1091.0814208984375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, "epoch": 0.021507211743620382, - "grad_norm": 0.159409761428833, - "kl": 0.0135955810546875, - "learning_rate": 2.1232876712328765e-07, - "loss": 0.1606, - "num_tokens": 52200380.0, - "reward": 0.82275390625, - "reward_std": 0.3297857344150543, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, + "grad_norm": 0.12250429391860962, + "kl": 0.0005865097045898438, + "learning_rate": 2.1160409556313992e-07, + "loss": 0.1173, + "num_tokens": 59611517.0, + "reward": 0.4775390625, + "reward_std": 0.20324724912643433, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.68994140625, - "rewards/tag_count_reward/std": 0.27702537178993225, + "rewards/tag_count_reward/mean": 0.4130859375, + "rewards/tag_count_reward/std": 0.17490464448928833, "step": 63 }, { @@ -1842,27 +1842,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.154296875, + "completions/clipped_ratio": 0.517578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 1013.60546875, - "completions/mean_terminated_length": 824.8822021484375, - "completions/min_length": 63.0, - "completions/min_terminated_length": 63.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1588.896484375, + "completions/mean_terminated_length": 1096.3360595703125, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, "epoch": 0.021848596057011182, - "grad_norm": 0.16638486087322235, - "kl": 0.0105438232421875, - "learning_rate": 2.1575342465753425e-07, - "loss": 0.1404, - "num_tokens": 52798274.0, - "reward": 0.91357421875, - "reward_std": 0.3621740937232971, - "rewards/accuracy_reward/mean": 0.189453125, - "rewards/accuracy_reward/std": 0.3922513723373413, + "grad_norm": 0.1131078451871872, + "kl": 0.0005655288696289062, + "learning_rate": 2.1501706484641638e-07, + "loss": 0.085, + "num_tokens": 60503960.0, + "reward": 0.53271484375, + "reward_std": 0.24876108765602112, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.72412109375, - "rewards/tag_count_reward/std": 0.27168965339660645, + "rewards/tag_count_reward/mean": 0.41357421875, + "rewards/tag_count_reward/std": 0.18762163817882538, "step": 64 }, { @@ -1871,27 +1871,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.21875, + "completions/clipped_ratio": 0.546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1173.52734375, - "completions/mean_terminated_length": 928.6749877929688, - "completions/min_length": 40.0, - "completions/min_terminated_length": 40.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1626.666015625, + "completions/mean_terminated_length": 1118.159423828125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, "epoch": 0.02218998037040198, - "grad_norm": 0.12891757488250732, - "kl": 0.0115509033203125, - "learning_rate": 2.191780821917808e-07, - "loss": 0.1843, - "num_tokens": 53479248.0, - "reward": 0.8671875, - "reward_std": 0.35664865374565125, - "rewards/accuracy_reward/mean": 0.18359375, - "rewards/accuracy_reward/std": 0.3875311613082886, + "grad_norm": 0.1046229749917984, + "kl": 0.0005731582641601562, + "learning_rate": 2.1843003412969284e-07, + "loss": 0.0843, + "num_tokens": 61416941.0, + "reward": 0.494140625, + "reward_std": 0.2173539400100708, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.68359375, - "rewards/tag_count_reward/std": 0.29382818937301636, + "rewards/tag_count_reward/mean": 0.396484375, + "rewards/tag_count_reward/std": 0.17282997071743011, "step": 65 }, { @@ -1900,27 +1900,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.271484375, + "completions/clipped_ratio": 0.548828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1246.38671875, - "completions/mean_terminated_length": 947.6622314453125, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1634.9140625, + "completions/mean_terminated_length": 1132.41552734375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.02253136468379278, - "grad_norm": 0.12625646591186523, - "kl": 0.00994873046875, - "learning_rate": 2.2260273972602739e-07, - "loss": 0.1674, - "num_tokens": 54194758.0, - "reward": 0.73876953125, - "reward_std": 0.2965083122253418, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, + "grad_norm": 0.10851604491472244, + "kl": 0.0005245208740234375, + "learning_rate": 2.2184300341296927e-07, + "loss": 0.0819, + "num_tokens": 62331377.0, + "reward": 0.46044921875, + "reward_std": 0.19750535488128662, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.66259765625, - "rewards/tag_count_reward/std": 0.28932827711105347, + "rewards/tag_count_reward/mean": 0.39599609375, + "rewards/tag_count_reward/std": 0.17003679275512695, "step": 66 }, { @@ -1929,27 +1929,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.251953125, + "completions/clipped_ratio": 0.654296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1216.79296875, - "completions/mean_terminated_length": 936.830322265625, - "completions/min_length": 86.0, - "completions/min_terminated_length": 86.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1768.828125, + "completions/mean_terminated_length": 1240.4520263671875, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, "epoch": 0.02287274899718358, - "grad_norm": 0.12288202345371246, - "kl": 0.0096435546875, - "learning_rate": 2.2602739726027396e-07, - "loss": 0.1781, - "num_tokens": 54886236.0, - "reward": 0.8212890625, - "reward_std": 0.3608725070953369, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, + "grad_norm": 0.09718859195709229, + "kl": 0.0005950927734375, + "learning_rate": 2.2525597269624572e-07, + "loss": 0.0878, + "num_tokens": 63305497.0, + "reward": 0.4404296875, + "reward_std": 0.2352093756198883, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.6943359375, - "rewards/tag_count_reward/std": 0.2993423044681549, + "rewards/tag_count_reward/mean": 0.3681640625, + "rewards/tag_count_reward/std": 0.16088110208511353, "step": 67 }, { @@ -1958,27 +1958,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.2734375, + "completions/clipped_ratio": 0.638671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 1220.8671875, - "completions/mean_terminated_length": 909.5806274414062, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1706.013671875, + "completions/mean_terminated_length": 1101.52978515625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, "epoch": 0.02321413331057438, - "grad_norm": 0.2134619951248169, - "kl": 0.0125885009765625, - "learning_rate": 2.2945205479452055e-07, - "loss": 0.1652, - "num_tokens": 55588616.0, - "reward": 0.85107421875, - "reward_std": 0.3770219385623932, - "rewards/accuracy_reward/mean": 0.173828125, - "rewards/accuracy_reward/std": 0.3793322443962097, + "grad_norm": 0.09850999712944031, + "kl": 0.0006017684936523438, + "learning_rate": 2.2866894197952215e-07, + "loss": 0.0884, + "num_tokens": 64256272.0, + "reward": 0.44970703125, + "reward_std": 0.21088473498821259, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.67724609375, - "rewards/tag_count_reward/std": 0.30800607800483704, + "rewards/tag_count_reward/mean": 0.36376953125, + "rewards/tag_count_reward/std": 0.1621493548154831, "step": 68 }, { @@ -1987,27 +1987,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.33203125, + "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 1318.947265625, - "completions/mean_terminated_length": 956.5526123046875, - "completions/min_length": 95.0, - "completions/min_terminated_length": 95.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1809.505859375, + "completions/mean_terminated_length": 1284.8187255859375, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, "epoch": 0.02355551762396518, - "grad_norm": 0.13541601598262787, - "kl": 0.0112457275390625, - "learning_rate": 2.328767123287671e-07, - "loss": 0.2252, - "num_tokens": 56348333.0, - "reward": 0.66455078125, - "reward_std": 0.2940008044242859, - "rewards/accuracy_reward/mean": 0.018145160749554634, - "rewards/accuracy_reward/std": 0.1336110383272171, - "rewards/format_reward/mean": 0.0, - "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.64697265625, - "rewards/tag_count_reward/std": 0.3124231696128845, + "grad_norm": 0.09798890352249146, + "kl": 0.0005674362182617188, + "learning_rate": 2.3208191126279864e-07, + "loss": 0.0903, + "num_tokens": 65267155.0, + "reward": 0.36962890625, + "reward_std": 0.1394367218017578, + "rewards/accuracy_reward/mean": 0.016129031777381897, + "rewards/accuracy_reward/std": 0.12609896063804626, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.35400390625, + "rewards/tag_count_reward/std": 0.1443610042333603, "step": 69 }, { @@ -2016,27 +2016,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.240234375, + "completions/clipped_ratio": 0.5625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 1215.466796875, - "completions/mean_terminated_length": 952.2236328125, - "completions/min_length": 183.0, - "completions/min_terminated_length": 183.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1678.69921875, + "completions/mean_terminated_length": 1203.884033203125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.02389690193735598, - "grad_norm": 0.12335766851902008, - "kl": 0.0104522705078125, - "learning_rate": 2.363013698630137e-07, - "loss": 0.206, - "num_tokens": 57049292.0, - "reward": 0.82177734375, - "reward_std": 0.35854047536849976, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310423493385315, + "grad_norm": 0.09649016708135605, + "kl": 0.0005521774291992188, + "learning_rate": 2.354948805460751e-07, + "loss": 0.1031, + "num_tokens": 66205289.0, + "reward": 0.48681640625, + "reward_std": 0.23590239882469177, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.69677734375, - "rewards/tag_count_reward/std": 0.2987651824951172, + "rewards/tag_count_reward/mean": 0.39501953125, + "rewards/tag_count_reward/std": 0.17441430687904358, "step": 70 }, { @@ -2045,27 +2045,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.25390625, + "completions/clipped_ratio": 0.60546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 1196.556640625, - "completions/mean_terminated_length": 906.7984619140625, - "completions/min_length": 98.0, - "completions/min_terminated_length": 98.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1728.423828125, + "completions/mean_terminated_length": 1237.985107421875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.02423828625074678, - "grad_norm": 0.7646471261978149, - "kl": 0.0182647705078125, - "learning_rate": 2.3972602739726023e-07, - "loss": 0.2167, - "num_tokens": 57738057.0, - "reward": 0.7978515625, - "reward_std": 0.3488779664039612, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, + "grad_norm": 0.09022075682878494, + "kl": 0.0005817413330078125, + "learning_rate": 2.3890784982935155e-07, + "loss": 0.0772, + "num_tokens": 67166370.0, + "reward": 0.43994140625, + "reward_std": 0.16247710585594177, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.6884765625, - "rewards/tag_count_reward/std": 0.3090671896934509, + "rewards/tag_count_reward/mean": 0.38134765625, + "rewards/tag_count_reward/std": 0.17263565957546234, "step": 71 }, { @@ -2074,27 +2074,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.201171875, + "completions/clipped_ratio": 0.59765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1155.95703125, - "completions/mean_terminated_length": 931.310546875, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1685.384765625, + "completions/mean_terminated_length": 1146.74267578125, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, "epoch": 0.02457967056413758, - "grad_norm": 0.13287417590618134, - "kl": 0.0117034912109375, - "learning_rate": 2.4315068493150685e-07, - "loss": 0.1464, - "num_tokens": 58416035.0, - "reward": 0.80859375, - "reward_std": 0.345120906829834, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, + "grad_norm": 0.09682461619377136, + "kl": 0.0006361007690429688, + "learning_rate": 2.42320819112628e-07, + "loss": 0.0901, + "num_tokens": 68115415.0, + "reward": 0.43603515625, + "reward_std": 0.19190119206905365, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.720703125, - "rewards/tag_count_reward/std": 0.2868976593017578, + "rewards/tag_count_reward/mean": 0.38330078125, + "rewards/tag_count_reward/std": 0.16678567230701447, "step": 72 }, { @@ -2103,27 +2103,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.2109375, + "completions/clipped_ratio": 0.5234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1118.962890625, - "completions/mean_terminated_length": 870.6064453125, - "completions/min_length": 134.0, - "completions/min_terminated_length": 134.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1604.630859375, + "completions/mean_terminated_length": 1117.651611328125, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, "epoch": 0.024921054877528378, - "grad_norm": 0.13646827638149261, - "kl": 0.01141357421875, - "learning_rate": 2.465753424657534e-07, - "loss": 0.1898, - "num_tokens": 59075296.0, - "reward": 0.8046875, - "reward_std": 0.32949909567832947, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, + "grad_norm": 0.10817335546016693, + "kl": 0.0006189346313476562, + "learning_rate": 2.457337883959044e-07, + "loss": 0.0844, + "num_tokens": 69023338.0, + "reward": 0.45166015625, + "reward_std": 0.17577175796031952, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.705078125, - "rewards/tag_count_reward/std": 0.2908143401145935, + "rewards/tag_count_reward/mean": 0.39501953125, + "rewards/tag_count_reward/std": 0.1494922935962677, "step": 73 }, { @@ -2132,27 +2132,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.205078125, + "completions/clipped_ratio": 0.537109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 1106.208984375, - "completions/mean_terminated_length": 863.2407836914062, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1612.94921875, + "completions/mean_terminated_length": 1108.1434326171875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, "epoch": 0.025262439190919177, - "grad_norm": 0.14528246223926544, - "kl": 0.011138916015625, - "learning_rate": 2.5e-07, - "loss": 0.1896, - "num_tokens": 59715547.0, - "reward": 0.91357421875, - "reward_std": 0.3325057923793793, - "rewards/accuracy_reward/mean": 0.181640625, - "rewards/accuracy_reward/std": 0.38592514395713806, + "grad_norm": 0.10391208529472351, + "kl": 0.00058746337890625, + "learning_rate": 2.4914675767918084e-07, + "loss": 0.0917, + "num_tokens": 69923040.0, + "reward": 0.51416015625, + "reward_std": 0.2049870491027832, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.73193359375, - "rewards/tag_count_reward/std": 0.28930845856666565, + "rewards/tag_count_reward/mean": 0.39697265625, + "rewards/tag_count_reward/std": 0.16846729815006256, "step": 74 }, { @@ -2161,27 +2161,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.240234375, + "completions/clipped_ratio": 0.61328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1172.107421875, - "completions/mean_terminated_length": 895.1542358398438, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/mean_length": 1733.513671875, + "completions/mean_terminated_length": 1234.7828369140625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, "epoch": 0.025603823504309977, - "grad_norm": 0.13012762367725372, - "kl": 0.01123046875, - "learning_rate": 2.5342465753424656e-07, - "loss": 0.2159, - "num_tokens": 60395666.0, - "reward": 0.7939453125, - "reward_std": 0.35236185789108276, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, + "grad_norm": 0.09485740214586258, + "kl": 0.000629425048828125, + "learning_rate": 2.525597269624573e-07, + "loss": 0.0839, + "num_tokens": 70890599.0, + "reward": 0.44921875, + "reward_std": 0.22727754712104797, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7099609375, - "rewards/tag_count_reward/std": 0.30102303624153137, + "rewards/tag_count_reward/mean": 0.37890625, + "rewards/tag_count_reward/std": 0.18302303552627563, "step": 75 }, { @@ -2190,27 +2190,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.173828125, + "completions/clipped_ratio": 0.55859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1107.95703125, - "completions/mean_terminated_length": 910.1702270507812, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/mean_length": 1676.318359375, + "completions/mean_terminated_length": 1205.960205078125, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, "epoch": 0.025945207817700777, - "grad_norm": 0.13782428205013275, - "kl": 0.0135498046875, - "learning_rate": 2.568493150684932e-07, - "loss": 0.1613, - "num_tokens": 61032620.0, - "reward": 0.92822265625, - "reward_std": 0.3581881821155548, - "rewards/accuracy_reward/mean": 0.173828125, - "rewards/accuracy_reward/std": 0.3793322443962097, + "grad_norm": 0.09795970469713211, + "kl": 0.0005970001220703125, + "learning_rate": 2.5597269624573375e-07, + "loss": 0.0597, + "num_tokens": 71818554.0, + "reward": 0.50341796875, + "reward_std": 0.24869966506958008, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.75439453125, - "rewards/tag_count_reward/std": 0.28814682364463806, + "rewards/tag_count_reward/mean": 0.39990234375, + "rewards/tag_count_reward/std": 0.18135979771614075, "step": 76 }, { @@ -2219,27 +2219,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.216796875, + "completions/clipped_ratio": 0.666015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1145.453125, - "completions/mean_terminated_length": 895.6209716796875, - "completions/min_length": 73.0, - "completions/min_terminated_length": 73.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1765.73828125, + "completions/mean_terminated_length": 1202.865478515625, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, "epoch": 0.026286592131091576, - "grad_norm": 0.13462021946907043, - "kl": 0.012237548828125, - "learning_rate": 2.602739726027397e-07, - "loss": 0.2469, - "num_tokens": 61696596.0, - "reward": 0.8056640625, - "reward_std": 0.32687926292419434, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, + "grad_norm": 0.08727016299962997, + "kl": 0.0006475448608398438, + "learning_rate": 2.593856655290102e-07, + "loss": 0.0747, + "num_tokens": 72800116.0, + "reward": 0.40673828125, + "reward_std": 0.1413293480873108, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7216796875, - "rewards/tag_count_reward/std": 0.29828062653541565, + "rewards/tag_count_reward/mean": 0.35791015625, + "rewards/tag_count_reward/std": 0.1562492400407791, "step": 77 }, { @@ -2248,27 +2248,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.18359375, + "completions/clipped_ratio": 0.515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1071.12109375, - "completions/mean_terminated_length": 851.440185546875, - "completions/min_length": 85.0, - "completions/min_terminated_length": 85.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1620.638671875, + "completions/mean_terminated_length": 1165.70556640625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, "epoch": 0.026627976444482376, - "grad_norm": 0.14011922478675842, - "kl": 0.0136871337890625, - "learning_rate": 2.6369863013698626e-07, - "loss": 0.2106, - "num_tokens": 62326322.0, - "reward": 0.82958984375, - "reward_std": 0.3330492377281189, + "grad_norm": 0.09837228059768677, + "kl": 0.000667572021484375, + "learning_rate": 2.627986348122867e-07, + "loss": 0.0904, + "num_tokens": 73711195.0, + "reward": 0.50830078125, + "reward_std": 0.2117748260498047, "rewards/accuracy_reward/mean": 0.1088709682226181, "rewards/accuracy_reward/std": 0.31179171800613403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.72412109375, - "rewards/tag_count_reward/std": 0.29706525802612305, + "rewards/tag_count_reward/mean": 0.40283203125, + "rewards/tag_count_reward/std": 0.17192016541957855, "step": 78 }, { @@ -2277,27 +2277,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.251953125, + "completions/clipped_ratio": 0.666015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 1237.875, - "completions/mean_terminated_length": 965.0130615234375, - "completions/min_length": 222.0, - "completions/min_terminated_length": 222.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1789.73828125, + "completions/mean_terminated_length": 1274.72509765625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, "epoch": 0.026969360757873175, - "grad_norm": 0.12821093201637268, - "kl": 0.0119171142578125, - "learning_rate": 2.671232876712329e-07, - "loss": 0.2283, - "num_tokens": 63040738.0, - "reward": 0.8056640625, - "reward_std": 0.36292368173599243, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, + "grad_norm": 0.09820061922073364, + "kl": 0.0006532669067382812, + "learning_rate": 2.6621160409556315e-07, + "loss": 0.091, + "num_tokens": 74708165.0, + "reward": 0.39404296875, + "reward_std": 0.1636386513710022, + "rewards/accuracy_reward/mean": 0.029296875, + "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7294921875, - "rewards/tag_count_reward/std": 0.3125229477882385, + "rewards/tag_count_reward/mean": 0.36474609375, + "rewards/tag_count_reward/std": 0.16813799738883972, "step": 79 }, { @@ -2306,27 +2306,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.189453125, + "completions/clipped_ratio": 0.515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 1023.4453125, - "completions/mean_terminated_length": 783.9711303710938, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1564.798828125, + "completions/mean_terminated_length": 1050.42333984375, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, "epoch": 0.027310745071263975, - "grad_norm": 0.15886980295181274, - "kl": 0.014007568359375, - "learning_rate": 2.7054794520547945e-07, - "loss": 0.2187, - "num_tokens": 63649078.0, - "reward": 0.8125, - "reward_std": 0.34062469005584717, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, + "grad_norm": 0.11139057576656342, + "kl": 0.000659942626953125, + "learning_rate": 2.696245733788396e-07, + "loss": 0.079, + "num_tokens": 75593678.0, + "reward": 0.4638671875, + "reward_std": 0.22049108147621155, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.755859375, - "rewards/tag_count_reward/std": 0.2908669114112854, + "rewards/tag_count_reward/mean": 0.4091796875, + "rewards/tag_count_reward/std": 0.1865164041519165, "step": 80 }, { @@ -2335,27 +2335,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1640625, + "completions/clipped_ratio": 0.435546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 1013.123046875, - "completions/mean_terminated_length": 810.016357421875, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1497.166015625, + "completions/mean_terminated_length": 1072.1280517578125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, "epoch": 0.027652129384654774, - "grad_norm": 0.14282691478729248, - "kl": 0.01397705078125, - "learning_rate": 2.73972602739726e-07, - "loss": 0.1899, - "num_tokens": 64239637.0, - "reward": 0.9736328125, - "reward_std": 0.3688603937625885, - "rewards/accuracy_reward/mean": 0.185546875, - "rewards/accuracy_reward/std": 0.38912075757980347, + "grad_norm": 0.10583925992250443, + "kl": 0.0006046295166015625, + "learning_rate": 2.73037542662116e-07, + "loss": 0.0877, + "num_tokens": 76432067.0, + "reward": 0.56884765625, + "reward_std": 0.23239648342132568, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7880859375, - "rewards/tag_count_reward/std": 0.2841450273990631, + "rewards/tag_count_reward/mean": 0.43017578125, + "rewards/tag_count_reward/std": 0.1780041754245758, "step": 81 }, { @@ -2364,27 +2364,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.25390625, + "completions/clipped_ratio": 0.654296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 1196.462890625, - "completions/mean_terminated_length": 906.6727905273438, - "completions/min_length": 175.0, - "completions/min_terminated_length": 175.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1764.62890625, + "completions/mean_terminated_length": 1228.3050537109375, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, "epoch": 0.027993513698045574, - "grad_norm": 0.1358703076839447, - "kl": 0.012176513671875, - "learning_rate": 2.773972602739726e-07, - "loss": 0.2331, - "num_tokens": 64931634.0, - "reward": 0.86865234375, - "reward_std": 0.40526849031448364, - "rewards/accuracy_reward/mean": 0.166015625, - "rewards/accuracy_reward/std": 0.3724585771560669, + "grad_norm": 0.09084117412567139, + "kl": 0.0006618499755859375, + "learning_rate": 2.764505119453925e-07, + "loss": 0.098, + "num_tokens": 77414965.0, + "reward": 0.4375, + "reward_std": 0.2354152500629425, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.70263671875, - "rewards/tag_count_reward/std": 0.31017935276031494, + "rewards/tag_count_reward/mean": 0.361328125, + "rewards/tag_count_reward/std": 0.15736515820026398, "step": 82 }, { @@ -2393,27 +2393,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.14453125, + "completions/clipped_ratio": 0.572265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 996.0390625, - "completions/mean_terminated_length": 818.3104858398438, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1673.951171875, + "completions/mean_terminated_length": 1173.5113525390625, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, "epoch": 0.028334898011436373, - "grad_norm": 0.165467768907547, - "kl": 0.015838623046875, - "learning_rate": 2.8082191780821916e-07, - "loss": 0.2608, - "num_tokens": 65515750.0, - "reward": 0.8486328125, - "reward_std": 0.3327805995941162, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, + "grad_norm": 0.10642743855714798, + "kl": 0.00067901611328125, + "learning_rate": 2.798634812286689e-07, + "loss": 0.1359, + "num_tokens": 78346172.0, + "reward": 0.4423828125, + "reward_std": 0.2006877064704895, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7705078125, - "rewards/tag_count_reward/std": 0.2816455364227295, + "rewards/tag_count_reward/mean": 0.3857421875, + "rewards/tag_count_reward/std": 0.16955633461475372, "step": 83 }, { @@ -2422,27 +2422,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.23046875, + "completions/clipped_ratio": 0.552734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 1168.208984375, - "completions/mean_terminated_length": 904.71826171875, - "completions/min_length": 40.0, - "completions/min_terminated_length": 40.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1660.591796875, + "completions/mean_terminated_length": 1181.8297119140625, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, "epoch": 0.028676282324827173, - "grad_norm": 0.1360129863023758, - "kl": 0.013458251953125, - "learning_rate": 2.842465753424658e-07, - "loss": 0.1568, - "num_tokens": 66188513.0, - "reward": 0.83447265625, - "reward_std": 0.3465424180030823, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, + "grad_norm": 0.10082748532295227, + "kl": 0.000713348388671875, + "learning_rate": 2.8327645051194536e-07, + "loss": 0.0626, + "num_tokens": 79271035.0, + "reward": 0.44580078125, + "reward_std": 0.16366678476333618, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.73486328125, - "rewards/tag_count_reward/std": 0.3083006739616394, + "rewards/tag_count_reward/mean": 0.39306640625, + "rewards/tag_count_reward/std": 0.17037363350391388, "step": 84 }, { @@ -2451,27 +2451,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.234375, + "completions/clipped_ratio": 0.662109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1158.1171875, - "completions/mean_terminated_length": 885.7040405273438, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1757.416015625, + "completions/mean_terminated_length": 1188.0057373046875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, "epoch": 0.029017666638217973, - "grad_norm": 0.1278849095106125, - "kl": 0.0155487060546875, - "learning_rate": 2.876712328767123e-07, - "loss": 0.2267, - "num_tokens": 66857309.0, - "reward": 0.86767578125, - "reward_std": 0.3450787365436554, - "rewards/accuracy_reward/mean": 0.140625, - "rewards/accuracy_reward/std": 0.3479743003845215, + "grad_norm": 0.09040378034114838, + "kl": 0.0007390975952148438, + "learning_rate": 2.8668941979522184e-07, + "loss": 0.0905, + "num_tokens": 80246672.0, + "reward": 0.431640625, + "reward_std": 0.20461226999759674, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.72705078125, - "rewards/tag_count_reward/std": 0.303414523601532, + "rewards/tag_count_reward/mean": 0.3671875, + "rewards/tag_count_reward/std": 0.16083654761314392, "step": 85 }, { @@ -2480,27 +2480,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.263671875, + "completions/clipped_ratio": 0.607421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 1200.580078125, - "completions/mean_terminated_length": 897.1273193359375, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1716.48828125, + "completions/mean_terminated_length": 1203.55224609375, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, "epoch": 0.029359050951608772, - "grad_norm": 0.13545668125152588, - "kl": 0.0130767822265625, - "learning_rate": 2.9109589041095887e-07, - "loss": 0.1782, - "num_tokens": 67557238.0, - "reward": 0.85205078125, - "reward_std": 0.3195267915725708, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, + "grad_norm": 0.09619715064764023, + "kl": 0.0007219314575195312, + "learning_rate": 2.9010238907849827e-07, + "loss": 0.0785, + "num_tokens": 81210746.0, + "reward": 0.4541015625, + "reward_std": 0.17044886946678162, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.72119140625, - "rewards/tag_count_reward/std": 0.31166985630989075, + "rewards/tag_count_reward/mean": 0.3701171875, + "rewards/tag_count_reward/std": 0.15632642805576324, "step": 86 }, { @@ -2509,27 +2509,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.15234375, + "completions/clipped_ratio": 0.52734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 964.841796875, - "completions/mean_terminated_length": 770.1727905273438, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1533.15234375, + "completions/mean_terminated_length": 958.7354736328125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.02970043526499957, - "grad_norm": 0.1605350822210312, - "kl": 0.01654052734375, - "learning_rate": 2.945205479452055e-07, - "loss": 0.1598, - "num_tokens": 68130005.0, - "reward": 0.87548828125, - "reward_std": 0.33797964453697205, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, + "grad_norm": 0.11499039828777313, + "kl": 0.0007534027099609375, + "learning_rate": 2.935153583617747e-07, + "loss": 0.1324, + "num_tokens": 82074488.0, + "reward": 0.43310546875, + "reward_std": 0.1939796805381775, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.77001953125, - "rewards/tag_count_reward/std": 0.30119559168815613, + "rewards/tag_count_reward/mean": 0.37841796875, + "rewards/tag_count_reward/std": 0.16841623187065125, "step": 87 }, { @@ -2538,27 +2538,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.228515625, + "completions/clipped_ratio": 0.53515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 1131.421875, - "completions/mean_terminated_length": 859.9291381835938, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1669.015625, + "completions/mean_terminated_length": 1232.7059326171875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, "epoch": 0.030041819578390375, - "grad_norm": 0.1418033391237259, - "kl": 0.0140380859375, - "learning_rate": 2.9794520547945206e-07, - "loss": 0.1696, - "num_tokens": 68796397.0, - "reward": 0.90576171875, - "reward_std": 0.36803168058395386, - "rewards/accuracy_reward/mean": 0.1640625, - "rewards/accuracy_reward/std": 0.37069445848464966, + "grad_norm": 0.10196422040462494, + "kl": 0.0007295608520507812, + "learning_rate": 2.969283276450512e-07, + "loss": 0.0908, + "num_tokens": 83016128.0, + "reward": 0.466796875, + "reward_std": 0.2298911064863205, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.74169921875, - "rewards/tag_count_reward/std": 0.3073694407939911, + "rewards/tag_count_reward/mean": 0.392578125, + "rewards/tag_count_reward/std": 0.16006234288215637, "step": 88 }, { @@ -2567,27 +2567,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.19921875, + "completions/clipped_ratio": 0.48828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 1130.34375, - "completions/mean_terminated_length": 902.0487670898438, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1618.51171875, + "completions/mean_terminated_length": 1208.6947021484375, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, "epoch": 0.030383203891781174, - "grad_norm": 0.14273065328598022, - "kl": 0.0142974853515625, - "learning_rate": 3.013698630136986e-07, - "loss": 0.2161, - "num_tokens": 69448781.0, - "reward": 0.859375, - "reward_std": 0.36156928539276123, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, + "grad_norm": 0.11078286170959473, + "kl": 0.0007228851318359375, + "learning_rate": 3.0034129692832767e-07, + "loss": 0.1089, + "num_tokens": 83918454.0, + "reward": 0.47412109375, + "reward_std": 0.2257877141237259, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.763671875, - "rewards/tag_count_reward/std": 0.2910245656967163, + "rewards/tag_count_reward/mean": 0.41357421875, + "rewards/tag_count_reward/std": 0.17757421731948853, "step": 89 }, { @@ -2596,27 +2596,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.22265625, + "completions/clipped_ratio": 0.583984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 1153.517578125, - "completions/mean_terminated_length": 897.3090209960938, - "completions/min_length": 120.0, - "completions/min_terminated_length": 120.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1640.728515625, + "completions/mean_terminated_length": 1069.018798828125, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, "epoch": 0.030724588205171974, - "grad_norm": 0.1517585963010788, - "kl": 0.01446533203125, - "learning_rate": 3.047945205479452e-07, - "loss": 0.2051, - "num_tokens": 70114438.0, - "reward": 0.86328125, - "reward_std": 0.3575511574745178, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, + "grad_norm": 0.10113243758678436, + "kl": 0.0007505416870117188, + "learning_rate": 3.037542662116041e-07, + "loss": 0.0958, + "num_tokens": 84833563.0, + "reward": 0.48681640625, + "reward_std": 0.2212250530719757, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.75390625, - "rewards/tag_count_reward/std": 0.29672771692276, + "rewards/tag_count_reward/mean": 0.39111328125, + "rewards/tag_count_reward/std": 0.18100644648075104, "step": 90 }, { @@ -2625,27 +2625,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1953125, + "completions/clipped_ratio": 0.470703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 1065.25390625, - "completions/mean_terminated_length": 826.7233276367188, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1579.775390625, + "completions/mean_terminated_length": 1163.3837890625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, "epoch": 0.031065972518562773, - "grad_norm": 0.13828644156455994, - "kl": 0.0145416259765625, - "learning_rate": 3.0821917808219176e-07, - "loss": 0.2289, - "num_tokens": 70735144.0, - "reward": 0.884765625, - "reward_std": 0.335534006357193, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, + "grad_norm": 0.11080970615148544, + "kl": 0.0007219314575195312, + "learning_rate": 3.0716723549488053e-07, + "loss": 0.1205, + "num_tokens": 85717704.0, + "reward": 0.5, + "reward_std": 0.2212325781583786, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.779296875, - "rewards/tag_count_reward/std": 0.3006371259689331, + "rewards/tag_count_reward/mean": 0.40625, + "rewards/tag_count_reward/std": 0.1617843359708786, "step": 91 }, { @@ -2654,27 +2654,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.248046875, + "completions/clipped_ratio": 0.673828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 1210.0546875, - "completions/mean_terminated_length": 933.6415405273438, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1804.71875, + "completions/mean_terminated_length": 1302.1318359375, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, "epoch": 0.03140735683195357, - "grad_norm": 0.13442397117614746, - "kl": 0.0135650634765625, - "learning_rate": 3.116438356164384e-07, - "loss": 0.1872, - "num_tokens": 71439316.0, - "reward": 0.80322265625, - "reward_std": 0.3439953923225403, - "rewards/accuracy_reward/mean": 0.08749999850988388, - "rewards/accuracy_reward/std": 0.2828611731529236, - "rewards/format_reward/mean": 0.0, - "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.72119140625, - "rewards/tag_count_reward/std": 0.30811774730682373, + "grad_norm": 0.0941099226474762, + "kl": 0.0007429122924804688, + "learning_rate": 3.10580204778157e-07, + "loss": 0.1231, + "num_tokens": 86726344.0, + "reward": 0.396484375, + "reward_std": 0.18601566553115845, + "rewards/accuracy_reward/mean": 0.04374999925494194, + "rewards/accuracy_reward/std": 0.20475177466869354, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.35546875, + "rewards/tag_count_reward/std": 0.1462491750717163, "step": 92 }, { @@ -2683,27 +2683,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.16015625, + "completions/clipped_ratio": 0.408203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 967.873046875, - "completions/mean_terminated_length": 761.8953247070312, - "completions/min_length": 21.0, - "completions/min_terminated_length": 21.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1465.46875, + "completions/mean_terminated_length": 1063.65673828125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, "epoch": 0.03174874114534437, - "grad_norm": 0.15893946588039398, - "kl": 0.0166015625, - "learning_rate": 3.150684931506849e-07, - "loss": 0.1858, - "num_tokens": 72004003.0, - "reward": 0.94580078125, - "reward_std": 0.3452272117137909, - "rewards/accuracy_reward/mean": 0.142578125, - "rewards/accuracy_reward/std": 0.3499840497970581, + "grad_norm": 0.12296649813652039, + "kl": 0.0007867813110351562, + "learning_rate": 3.1399317406143344e-07, + "loss": 0.0957, + "num_tokens": 87545800.0, + "reward": 0.56103515625, + "reward_std": 0.2506997883319855, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.80322265625, - "rewards/tag_count_reward/std": 0.28364479541778564, + "rewards/tag_count_reward/mean": 0.44970703125, + "rewards/tag_count_reward/std": 0.19036197662353516, "step": 93 }, { @@ -2712,27 +2712,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.2578125, + "completions/clipped_ratio": 0.560546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 1193.263671875, - "completions/mean_terminated_length": 896.3552856445312, - "completions/min_length": 58.0, - "completions/min_terminated_length": 58.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1676.126953125, + "completions/mean_terminated_length": 1201.7822265625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, "epoch": 0.03209012545873517, - "grad_norm": 0.15196147561073303, - "kl": 0.014251708984375, - "learning_rate": 3.1849315068493147e-07, - "loss": 0.2466, - "num_tokens": 72692618.0, - "reward": 0.80078125, - "reward_std": 0.3593321740627289, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, + "grad_norm": 0.10059032589197159, + "kl": 0.0007867813110351562, + "learning_rate": 3.1740614334470987e-07, + "loss": 0.104, + "num_tokens": 88481641.0, + "reward": 0.4453125, + "reward_std": 0.20004956424236298, + "rewards/accuracy_reward/mean": 0.044921875, + "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.732421875, - "rewards/tag_count_reward/std": 0.31465137004852295, + "rewards/tag_count_reward/mean": 0.400390625, + "rewards/tag_count_reward/std": 0.19305415451526642, "step": 94 }, { @@ -2741,27 +2741,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.19140625, + "completions/clipped_ratio": 0.580078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 1134.75390625, - "completions/mean_terminated_length": 918.5748901367188, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1689.634765625, + "completions/mean_terminated_length": 1194.5906982421875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, "epoch": 0.03243150977212597, - "grad_norm": 0.1329055279493332, - "kl": 0.013458251953125, - "learning_rate": 3.219178082191781e-07, - "loss": 0.1661, - "num_tokens": 73355356.0, - "reward": 0.90966796875, - "reward_std": 0.38059353828430176, - "rewards/accuracy_reward/mean": 0.146484375, - "rewards/accuracy_reward/std": 0.35393697023391724, + "grad_norm": 0.10083835572004318, + "kl": 0.0007677078247070312, + "learning_rate": 3.2081911262798635e-07, + "loss": 0.0985, + "num_tokens": 89428478.0, + "reward": 0.45068359375, + "reward_std": 0.22992177307605743, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.76318359375, - "rewards/tag_count_reward/std": 0.2945975065231323, + "rewards/tag_count_reward/mean": 0.38232421875, + "rewards/tag_count_reward/std": 0.1623731404542923, "step": 95 }, { @@ -2770,27 +2770,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.21875, + "completions/clipped_ratio": 0.556640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1084.638671875, - "completions/mean_terminated_length": 814.8974609375, - "completions/min_length": 95.0, - "completions/min_terminated_length": 95.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1665.900390625, + "completions/mean_terminated_length": 1186.1717529296875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.03277289408551677, - "grad_norm": 0.17101000249385834, - "kl": 0.0164794921875, - "learning_rate": 3.2534246575342466e-07, - "loss": 0.1787, - "num_tokens": 73980531.0, - "reward": 0.876953125, - "reward_std": 0.3465830683708191, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310423493385315, + "grad_norm": 0.11242767423391342, + "kl": 0.000881195068359375, + "learning_rate": 3.242320819112628e-07, + "loss": 0.1267, + "num_tokens": 90351259.0, + "reward": 0.47412109375, + "reward_std": 0.21313250064849854, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.751953125, - "rewards/tag_count_reward/std": 0.3080716133117676, + "rewards/tag_count_reward/mean": 0.39599609375, + "rewards/tag_count_reward/std": 0.1763918250799179, "step": 96 }, { @@ -2799,27 +2799,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.203125, + "completions/clipped_ratio": 0.48828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 1056.6796875, - "completions/mean_terminated_length": 803.990234375, - "completions/min_length": 24.0, - "completions/min_terminated_length": 24.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1587.48828125, + "completions/mean_terminated_length": 1148.0687255859375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, "epoch": 0.03311427839890757, - "grad_norm": 0.15752506256103516, - "kl": 0.01513671875, - "learning_rate": 3.287671232876712e-07, - "loss": 0.2682, - "num_tokens": 74597599.0, - "reward": 0.90771484375, - "reward_std": 0.3453283905982971, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, + "grad_norm": 0.1085725799202919, + "kl": 0.0008459091186523438, + "learning_rate": 3.276450511945392e-07, + "loss": 0.1388, + "num_tokens": 91240101.0, + "reward": 0.49072265625, + "reward_std": 0.20506152510643005, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.77490234375, - "rewards/tag_count_reward/std": 0.29838111996650696, + "rewards/tag_count_reward/mean": 0.41455078125, + "rewards/tag_count_reward/std": 0.1787327527999878, "step": 97 }, { @@ -2828,27 +2828,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.263671875, + "completions/clipped_ratio": 0.650390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1179.80078125, - "completions/mean_terminated_length": 868.9071655273438, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1738.52734375, + "completions/mean_terminated_length": 1162.804443359375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, "epoch": 0.03345566271229837, - "grad_norm": 0.1455349624156952, - "kl": 0.0147705078125, - "learning_rate": 3.321917808219178e-07, - "loss": 0.2405, - "num_tokens": 75282553.0, - "reward": 0.81298828125, - "reward_std": 0.32863861322402954, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, + "grad_norm": 0.09691955894231796, + "kl": 0.000881195068359375, + "learning_rate": 3.3105802047781565e-07, + "loss": 0.0769, + "num_tokens": 92211123.0, + "reward": 0.41650390625, + "reward_std": 0.17518079280853271, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.73876953125, - "rewards/tag_count_reward/std": 0.3127991259098053, + "rewards/tag_count_reward/mean": 0.36376953125, + "rewards/tag_count_reward/std": 0.1636510044336319, "step": 98 }, { @@ -2857,27 +2857,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.216796875, + "completions/clipped_ratio": 0.53125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 1102.716796875, - "completions/mean_terminated_length": 841.0548706054688, - "completions/min_length": 20.0, - "completions/min_terminated_length": 20.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1590.169921875, + "completions/mean_terminated_length": 1071.2958984375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, "epoch": 0.03379704702568917, - "grad_norm": 0.1469789743423462, - "kl": 0.015625, - "learning_rate": 3.3561643835616436e-07, - "loss": 0.2115, - "num_tokens": 75924024.0, - "reward": 0.86279296875, - "reward_std": 0.34248465299606323, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, + "grad_norm": 0.12030370533466339, + "kl": 0.000919342041015625, + "learning_rate": 3.3447098976109213e-07, + "loss": 0.1244, + "num_tokens": 93102170.0, + "reward": 0.45556640625, + "reward_std": 0.21795539557933807, + "rewards/accuracy_reward/mean": 0.041015625, + "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.78271484375, - "rewards/tag_count_reward/std": 0.30007997155189514, + "rewards/tag_count_reward/mean": 0.41455078125, + "rewards/tag_count_reward/std": 0.19758372008800507, "step": 99 }, { @@ -2886,27 +2886,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.20703125, + "completions/clipped_ratio": 0.56640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 1135.515625, - "completions/mean_terminated_length": 897.28076171875, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1635.990234375, + "completions/mean_terminated_length": 1097.779296875, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, "epoch": 0.03413843133907997, - "grad_norm": 0.12671534717082977, - "kl": 0.0143890380859375, - "learning_rate": 3.39041095890411e-07, - "loss": 0.19, - "num_tokens": 76582544.0, - "reward": 0.90283203125, - "reward_std": 0.3243113160133362, - "rewards/accuracy_reward/mean": 0.12109375, - "rewards/accuracy_reward/std": 0.3265552520751953, + "grad_norm": 0.10042981058359146, + "kl": 0.0008440017700195312, + "learning_rate": 3.3788395904436856e-07, + "loss": 0.1006, + "num_tokens": 94016933.0, + "reward": 0.4833984375, + "reward_std": 0.21628820896148682, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.78173828125, - "rewards/tag_count_reward/std": 0.3034271001815796, + "rewards/tag_count_reward/mean": 0.4130859375, + "rewards/tag_count_reward/std": 0.2075263112783432, "step": 100 }, { @@ -2915,27 +2915,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1484375, + "completions/clipped_ratio": 0.59765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 1069.42578125, - "completions/mean_terminated_length": 898.8485717773438, - "completions/min_length": 180.0, - "completions/min_terminated_length": 180.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1745.943359375, + "completions/mean_terminated_length": 1297.25732421875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, "epoch": 0.03447981565247077, - "grad_norm": 0.14338794350624084, - "kl": 0.016845703125, - "learning_rate": 3.424657534246575e-07, - "loss": 0.2441, - "num_tokens": 77201354.0, - "reward": 0.91015625, - "reward_std": 0.35537150502204895, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, + "grad_norm": 0.10225279629230499, + "kl": 0.00095367431640625, + "learning_rate": 3.4129692832764504e-07, + "loss": 0.1114, + "num_tokens": 94982120.0, + "reward": 0.4267578125, + "reward_std": 0.20443692803382874, + "rewards/accuracy_reward/mean": 0.041015625, + "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.81640625, - "rewards/tag_count_reward/std": 0.280627578496933, + "rewards/tag_count_reward/mean": 0.3857421875, + "rewards/tag_count_reward/std": 0.16368384659290314, "step": 101 }, { @@ -2944,27 +2944,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.205078125, + "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1139.138671875, - "completions/mean_terminated_length": 904.6658325195312, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1687.296875, + "completions/mean_terminated_length": 1160.115478515625, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, "epoch": 0.03482119996586157, - "grad_norm": 0.13609696924686432, - "kl": 0.016632080078125, - "learning_rate": 3.4589041095890407e-07, - "loss": 0.2475, - "num_tokens": 77861777.0, - "reward": 0.87158203125, - "reward_std": 0.3285045623779297, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, + "grad_norm": 0.0951334610581398, + "kl": 0.000873565673828125, + "learning_rate": 3.447098976109215e-07, + "loss": 0.1125, + "num_tokens": 95923200.0, + "reward": 0.419921875, + "reward_std": 0.17092673480510712, + "rewards/accuracy_reward/mean": 0.03515625, + "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.80126953125, - "rewards/tag_count_reward/std": 0.28954946994781494, + "rewards/tag_count_reward/mean": 0.384765625, + "rewards/tag_count_reward/std": 0.17805851995944977, "step": 102 }, { @@ -2973,27 +2973,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.2109375, + "completions/clipped_ratio": 0.49609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 1077.892578125, - "completions/mean_terminated_length": 818.5569458007812, - "completions/min_length": 60.0, - "completions/min_terminated_length": 60.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1586.826171875, + "completions/mean_terminated_length": 1132.8023681640625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, "epoch": 0.03516258427925237, - "grad_norm": 0.1515011191368103, - "kl": 0.017059326171875, - "learning_rate": 3.493150684931507e-07, - "loss": 0.1885, - "num_tokens": 78491514.0, - "reward": 0.88818359375, - "reward_std": 0.3469456434249878, - "rewards/accuracy_reward/mean": 0.10685484111309052, - "rewards/accuracy_reward/std": 0.3092404901981354, + "grad_norm": 0.11360824108123779, + "kl": 0.0009851455688476562, + "learning_rate": 3.4812286689419796e-07, + "loss": 0.1229, + "num_tokens": 96813511.0, + "reward": 0.46533203125, + "reward_std": 0.1948869526386261, + "rewards/accuracy_reward/mean": 0.04838709533214569, + "rewards/accuracy_reward/std": 0.21479946374893188, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.78466796875, - "rewards/tag_count_reward/std": 0.3002677857875824, + "rewards/tag_count_reward/mean": 0.41845703125, + "rewards/tag_count_reward/std": 0.1904422789812088, "step": 103 }, { @@ -3002,27 +3002,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.16796875, + "completions/clipped_ratio": 0.4375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 1041.28125, - "completions/mean_terminated_length": 838.0469360351562, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1569.619140625, + "completions/mean_terminated_length": 1197.545166015625, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, "epoch": 0.03550396859264317, - "grad_norm": 0.127638041973114, - "kl": 0.0147857666015625, - "learning_rate": 3.5273972602739726e-07, - "loss": 0.1355, - "num_tokens": 79106090.0, - "reward": 1.02099609375, - "reward_std": 0.39763882756233215, - "rewards/accuracy_reward/mean": 0.22782258689403534, - "rewards/accuracy_reward/std": 0.4198509752750397, + "grad_norm": 0.10358218848705292, + "kl": 0.000934600830078125, + "learning_rate": 3.515358361774744e-07, + "loss": 0.1045, + "num_tokens": 97698596.0, + "reward": 0.57763671875, + "reward_std": 0.2946711480617523, + "rewards/accuracy_reward/mean": 0.15322580933570862, + "rewards/accuracy_reward/std": 0.36056873202323914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.80029296875, - "rewards/tag_count_reward/std": 0.28331783413887024, + "rewards/tag_count_reward/mean": 0.42919921875, + "rewards/tag_count_reward/std": 0.180350661277771, "step": 104 }, { @@ -3031,27 +3031,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1640625, + "completions/clipped_ratio": 0.595703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 1055.228515625, - "completions/mean_terminated_length": 860.385498046875, - "completions/min_length": 120.0, - "completions/min_terminated_length": 120.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1664.892578125, + "completions/mean_terminated_length": 1100.41064453125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, "epoch": 0.03584535290603397, - "grad_norm": 0.24287424981594086, - "kl": 0.017822265625, - "learning_rate": 3.561643835616438e-07, - "loss": 0.1756, - "num_tokens": 79722751.0, - "reward": 0.94384765625, - "reward_std": 0.30647194385528564, - "rewards/accuracy_reward/mean": 0.12109375, - "rewards/accuracy_reward/std": 0.3265552520751953, + "grad_norm": 0.1326277107000351, + "kl": 0.001064300537109375, + "learning_rate": 3.5494880546075087e-07, + "loss": 0.1248, + "num_tokens": 98627405.0, + "reward": 0.44482421875, + "reward_std": 0.1768166720867157, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.82275390625, - "rewards/tag_count_reward/std": 0.27573922276496887, + "rewards/tag_count_reward/mean": 0.38818359375, + "rewards/tag_count_reward/std": 0.18325716257095337, "step": 105 }, { @@ -3060,27 +3060,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.19921875, + "completions/clipped_ratio": 0.48046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 1090.884765625, - "completions/mean_terminated_length": 852.773193359375, - "completions/min_length": 187.0, - "completions/min_terminated_length": 187.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1556.064453125, + "completions/mean_terminated_length": 1101.1165771484375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, "epoch": 0.03618673721942477, - "grad_norm": 0.14498759806156158, - "kl": 0.01666259765625, - "learning_rate": 3.595890410958904e-07, - "loss": 0.1711, - "num_tokens": 80364804.0, - "reward": 0.982421875, - "reward_std": 0.372316837310791, - "rewards/accuracy_reward/mean": 0.18359375, - "rewards/accuracy_reward/std": 0.3875311613082886, + "grad_norm": 0.11217821389436722, + "kl": 0.001010894775390625, + "learning_rate": 3.583617747440273e-07, + "loss": 0.1126, + "num_tokens": 99507630.0, + "reward": 0.55712890625, + "reward_std": 0.26952022314071655, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.798828125, - "rewards/tag_count_reward/std": 0.29018279910087585, + "rewards/tag_count_reward/mean": 0.43017578125, + "rewards/tag_count_reward/std": 0.19754503667354584, "step": 106 }, { @@ -3089,27 +3089,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.142578125, + "completions/clipped_ratio": 0.41015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1004.091796875, - "completions/mean_terminated_length": 830.5034790039062, - "completions/min_length": 96.0, - "completions/min_terminated_length": 96.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1474.470703125, + "completions/mean_terminated_length": 1075.658935546875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.036528121532815566, - "grad_norm": 0.1424538493156433, - "kl": 0.01788330078125, - "learning_rate": 3.6301369863013697e-07, - "loss": 0.1613, - "num_tokens": 80956115.0, - "reward": 0.974609375, - "reward_std": 0.3216649293899536, - "rewards/accuracy_reward/mean": 0.138671875, - "rewards/accuracy_reward/std": 0.34594178199768066, + "grad_norm": 0.11413634568452835, + "kl": 0.0010786056518554688, + "learning_rate": 3.6177474402730373e-07, + "loss": 0.0909, + "num_tokens": 100339775.0, + "reward": 0.517578125, + "reward_std": 0.20219513773918152, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8359375, - "rewards/tag_count_reward/std": 0.26942625641822815, + "rewards/tag_count_reward/mean": 0.447265625, + "rewards/tag_count_reward/std": 0.1919422447681427, "step": 107 }, { @@ -3118,27 +3118,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.236328125, + "completions/clipped_ratio": 0.57421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 1194.708984375, - "completions/mean_terminated_length": 930.6470336914062, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1703.119140625, + "completions/mean_terminated_length": 1238.0045166015625, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, "epoch": 0.036869505846206366, - "grad_norm": 0.12126666307449341, - "kl": 0.016265869140625, - "learning_rate": 3.664383561643836e-07, - "loss": 0.2176, - "num_tokens": 81649806.0, - "reward": 0.8544921875, - "reward_std": 0.3638536334037781, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, + "grad_norm": 0.09202966839075089, + "kl": 0.0010786056518554688, + "learning_rate": 3.6518771331058016e-07, + "loss": 0.0904, + "num_tokens": 101293772.0, + "reward": 0.44873046875, + "reward_std": 0.19328656792640686, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7509765625, - "rewards/tag_count_reward/std": 0.30926498770713806, + "rewards/tag_count_reward/mean": 0.39794921875, + "rewards/tag_count_reward/std": 0.1809430867433548, "step": 108 }, { @@ -3147,27 +3147,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.220703125, + "completions/clipped_ratio": 0.4453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 1093.07421875, - "completions/mean_terminated_length": 822.631591796875, - "completions/min_length": 134.0, - "completions/min_terminated_length": 134.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1530.89453125, + "completions/mean_terminated_length": 1115.7535400390625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, "epoch": 0.037210890159597165, - "grad_norm": 0.13973434269428253, - "kl": 0.019317626953125, - "learning_rate": 3.698630136986301e-07, - "loss": 0.1959, - "num_tokens": 82281076.0, - "reward": 0.90283203125, - "reward_std": 0.33090001344680786, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, + "grad_norm": 0.11212392151355743, + "kl": 0.0012645721435546875, + "learning_rate": 3.6860068259385664e-07, + "loss": 0.1122, + "num_tokens": 102149206.0, + "reward": 0.52880859375, + "reward_std": 0.2463226020336151, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.78759765625, - "rewards/tag_count_reward/std": 0.3059704005718231, + "rewards/tag_count_reward/mean": 0.42919921875, + "rewards/tag_count_reward/std": 0.180350661277771, "step": 109 }, { @@ -3176,27 +3176,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.166015625, + "completions/clipped_ratio": 0.42578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1078.703125, - "completions/mean_terminated_length": 885.751708984375, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1545.33984375, + "completions/mean_terminated_length": 1172.6190185546875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, "epoch": 0.037552274472987965, - "grad_norm": 0.13122792541980743, - "kl": 0.0200347900390625, - "learning_rate": 3.7328767123287667e-07, - "loss": 0.1871, - "num_tokens": 82908444.0, - "reward": 0.9423828125, - "reward_std": 0.3743685483932495, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, + "grad_norm": 0.11292169243097305, + "kl": 0.0011749267578125, + "learning_rate": 3.7201365187713307e-07, + "loss": 0.1128, + "num_tokens": 103015492.0, + "reward": 0.55322265625, + "reward_std": 0.260453462600708, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8115234375, - "rewards/tag_count_reward/std": 0.28732866048812866, + "rewards/tag_count_reward/mean": 0.44580078125, + "rewards/tag_count_reward/std": 0.19248844683170319, "step": 110 }, { @@ -3205,27 +3205,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.205078125, + "completions/clipped_ratio": 0.509765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 1085.521484375, - "completions/mean_terminated_length": 837.2161865234375, - "completions/min_length": 123.0, - "completions/min_terminated_length": 123.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1628.490234375, + "completions/mean_terminated_length": 1192.2669677734375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, "epoch": 0.037893658786378764, - "grad_norm": 0.1464829444885254, - "kl": 0.01904296875, - "learning_rate": 3.767123287671233e-07, - "loss": 0.2646, - "num_tokens": 83541111.0, - "reward": 0.896484375, - "reward_std": 0.37763655185699463, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, + "grad_norm": 0.10947854816913605, + "kl": 0.0013942718505859375, + "learning_rate": 3.754266211604095e-07, + "loss": 0.1225, + "num_tokens": 103926159.0, + "reward": 0.47900390625, + "reward_std": 0.2573707401752472, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.79296875, - "rewards/tag_count_reward/std": 0.29898616671562195, + "rewards/tag_count_reward/mean": 0.42041015625, + "rewards/tag_count_reward/std": 0.19879388809204102, "step": 111 }, { @@ -3234,27 +3234,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.09375, + "completions/clipped_ratio": 0.408203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 953.876953125, - "completions/mean_terminated_length": 840.6918334960938, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1493.845703125, + "completions/mean_terminated_length": 1111.6072998046875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, "epoch": 0.038235043099769564, - "grad_norm": 0.15541662275791168, - "kl": 0.01910400390625, - "learning_rate": 3.8013698630136986e-07, - "loss": 0.1899, - "num_tokens": 84104424.0, - "reward": 1.0166015625, - "reward_std": 0.33080601692199707, - "rewards/accuracy_reward/mean": 0.1484375, - "rewards/accuracy_reward/std": 0.35588082671165466, + "grad_norm": 0.11999060213565826, + "kl": 0.0016345977783203125, + "learning_rate": 3.78839590443686e-07, + "loss": 0.1302, + "num_tokens": 104765936.0, + "reward": 0.5302734375, + "reward_std": 0.25676479935646057, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8681640625, - "rewards/tag_count_reward/std": 0.24320943653583527, + "rewards/tag_count_reward/mean": 0.4482421875, + "rewards/tag_count_reward/std": 0.192208394408226, "step": 112 }, { @@ -3263,27 +3263,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.169921875, + "completions/clipped_ratio": 0.44921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1035.65234375, - "completions/mean_terminated_length": 828.4187622070312, - "completions/min_length": 82.0, - "completions/min_terminated_length": 82.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1497.994140625, + "completions/mean_terminated_length": 1049.40771484375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.038576427413160363, - "grad_norm": 0.16102302074432373, - "kl": 0.0198974609375, - "learning_rate": 3.835616438356164e-07, - "loss": 0.1851, - "num_tokens": 84708902.0, - "reward": 0.96240234375, - "reward_std": 0.33114585280418396, - "rewards/accuracy_reward/mean": 0.15120968222618103, - "rewards/accuracy_reward/std": 0.35861483216285706, + "grad_norm": 0.12736926972866058, + "kl": 0.0014085769653320312, + "learning_rate": 3.8225255972696247e-07, + "loss": 0.1189, + "num_tokens": 105607133.0, + "reward": 0.54296875, + "reward_std": 0.21889027953147888, + "rewards/accuracy_reward/mean": 0.09072580933570862, + "rewards/accuracy_reward/std": 0.2875087857246399, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.81591796875, - "rewards/tag_count_reward/std": 0.28312888741493225, + "rewards/tag_count_reward/mean": 0.455078125, + "rewards/tag_count_reward/std": 0.20968151092529297, "step": 113 }, { @@ -3292,27 +3292,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.126953125, + "completions/clipped_ratio": 0.34765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 996.826171875, - "completions/mean_terminated_length": 843.970947265625, - "completions/min_length": 23.0, - "completions/min_terminated_length": 23.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1416.224609375, + "completions/mean_terminated_length": 1079.530029296875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.03891781172655116, - "grad_norm": 0.15508544445037842, - "kl": 0.017730712890625, - "learning_rate": 3.86986301369863e-07, - "loss": 0.1869, - "num_tokens": 85298157.0, - "reward": 0.970703125, - "reward_std": 0.29038703441619873, - "rewards/accuracy_reward/mean": 0.138671875, - "rewards/accuracy_reward/std": 0.34594178199768066, + "grad_norm": 0.1168883740901947, + "kl": 0.0016460418701171875, + "learning_rate": 3.856655290102389e-07, + "loss": 0.142, + "num_tokens": 106411120.0, + "reward": 0.576171875, + "reward_std": 0.24163921177387238, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.83203125, - "rewards/tag_count_reward/std": 0.27064353227615356, + "rewards/tag_count_reward/mean": 0.482421875, + "rewards/tag_count_reward/std": 0.208511620759964, "step": 114 }, { @@ -3321,27 +3321,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.173828125, + "completions/clipped_ratio": 0.501953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1132.140625, - "completions/mean_terminated_length": 939.4420776367188, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1637.53515625, + "completions/mean_terminated_length": 1223.85107421875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.03925919603994196, - "grad_norm": 0.13651324808597565, - "kl": 0.0171051025390625, - "learning_rate": 3.9041095890410957e-07, - "loss": 0.1944, - "num_tokens": 85946037.0, - "reward": 1.00927734375, - "reward_std": 0.38964664936065674, - "rewards/accuracy_reward/mean": 0.1953125, - "rewards/accuracy_reward/std": 0.3968288004398346, + "grad_norm": 0.0987587496638298, + "kl": 0.00171661376953125, + "learning_rate": 3.8907849829351533e-07, + "loss": 0.108, + "num_tokens": 107317762.0, + "reward": 0.56591796875, + "reward_std": 0.28088486194610596, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.81396484375, - "rewards/tag_count_reward/std": 0.28271347284317017, + "rewards/tag_count_reward/mean": 0.43701171875, + "rewards/tag_count_reward/std": 0.20526880025863647, "step": 115 }, { @@ -3350,27 +3350,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.212890625, + "completions/clipped_ratio": 0.462890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1104.9609375, - "completions/mean_terminated_length": 849.895751953125, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1551.1953125, + "completions/mean_terminated_length": 1123.0400390625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, "epoch": 0.03960058035333276, - "grad_norm": 0.5017273426055908, - "kl": 0.0185699462890625, - "learning_rate": 3.938356164383562e-07, - "loss": 0.2407, - "num_tokens": 86581969.0, - "reward": 0.92626953125, - "reward_std": 0.3624322712421417, - "rewards/accuracy_reward/mean": 0.12109375, - "rewards/accuracy_reward/std": 0.3265552520751953, + "grad_norm": 0.10738595575094223, + "kl": 0.0013828277587890625, + "learning_rate": 3.924914675767918e-07, + "loss": 0.114, + "num_tokens": 108182166.0, + "reward": 0.486328125, + "reward_std": 0.2166741043329239, + "rewards/accuracy_reward/mean": 0.041015625, + "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.80517578125, - "rewards/tag_count_reward/std": 0.3020762503147125, + "rewards/tag_count_reward/mean": 0.4453125, + "rewards/tag_count_reward/std": 0.21657694876194, "step": 116 }, { @@ -3379,27 +3379,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.115234375, + "completions/clipped_ratio": 0.3359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 922.08984375, - "completions/mean_terminated_length": 775.4481201171875, - "completions/min_length": 26.0, - "completions/min_terminated_length": 26.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1364.5, + "completions/mean_terminated_length": 1018.7294311523438, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, "epoch": 0.03994196466672356, - "grad_norm": 0.17425385117530823, - "kl": 0.019683837890625, - "learning_rate": 3.972602739726027e-07, - "loss": 0.2106, - "num_tokens": 87127343.0, - "reward": 0.97802734375, - "reward_std": 0.33765822649002075, - "rewards/accuracy_reward/mean": 0.119140625, - "rewards/accuracy_reward/std": 0.32427072525024414, + "grad_norm": 0.1349182277917862, + "kl": 0.001888275146484375, + "learning_rate": 3.9590443686006824e-07, + "loss": 0.1143, + "num_tokens": 108954054.0, + "reward": 0.57958984375, + "reward_std": 0.28033432364463806, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.85888671875, - "rewards/tag_count_reward/std": 0.25361213088035583, + "rewards/tag_count_reward/mean": 0.49560546875, + "rewards/tag_count_reward/std": 0.2180800884962082, "step": 117 }, { @@ -3408,27 +3408,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.18359375, + "completions/clipped_ratio": 0.396484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 1080.203125, - "completions/mean_terminated_length": 862.5645751953125, - "completions/min_length": 117.0, - "completions/min_terminated_length": 117.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1479.646484375, + "completions/mean_terminated_length": 1106.26220703125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.04028334898011436, - "grad_norm": 0.15133266150951385, - "kl": 0.019561767578125, - "learning_rate": 4.006849315068493e-07, - "loss": 0.1964, - "num_tokens": 87759863.0, - "reward": 0.953125, - "reward_std": 0.3536233603954315, - "rewards/accuracy_reward/mean": 0.142578125, - "rewards/accuracy_reward/std": 0.3499840497970581, + "grad_norm": 0.11222651600837708, + "kl": 0.002185821533203125, + "learning_rate": 3.993174061433447e-07, + "loss": 0.1221, + "num_tokens": 109791089.0, + "reward": 0.56103515625, + "reward_std": 0.27152127027511597, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.810546875, - "rewards/tag_count_reward/std": 0.294263631105423, + "rewards/tag_count_reward/mean": 0.46337890625, + "rewards/tag_count_reward/std": 0.2133089154958725, "step": 118 }, { @@ -3437,27 +3437,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.26953125, + "completions/clipped_ratio": 0.556640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 1211.7890625, - "completions/mean_terminated_length": 903.2406616210938, - "completions/min_length": 80.0, - "completions/min_terminated_length": 80.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1674.65625, + "completions/mean_terminated_length": 1205.920654296875, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, "epoch": 0.04062473329350516, - "grad_norm": 0.1288234293460846, - "kl": 0.018951416015625, - "learning_rate": 4.041095890410959e-07, - "loss": 0.2069, - "num_tokens": 88465835.0, - "reward": 0.9287109375, - "reward_std": 0.4196006655693054, - "rewards/accuracy_reward/mean": 0.154296875, - "rewards/accuracy_reward/std": 0.36158639192581177, + "grad_norm": 0.10628098249435425, + "kl": 0.0018215179443359375, + "learning_rate": 4.0273037542662116e-07, + "loss": 0.1282, + "num_tokens": 110734049.0, + "reward": 0.5224609375, + "reward_std": 0.24728238582611084, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7744140625, - "rewards/tag_count_reward/std": 0.3075050413608551, + "rewards/tag_count_reward/mean": 0.4150390625, + "rewards/tag_count_reward/std": 0.20177392661571503, "step": 119 }, { @@ -3466,27 +3466,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1875, + "completions/clipped_ratio": 0.419921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1017.51953125, - "completions/mean_terminated_length": 779.7163696289062, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1512.263671875, + "completions/mean_terminated_length": 1124.4410400390625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.04096611760689596, - "grad_norm": 0.1571938693523407, - "kl": 0.02044677734375, - "learning_rate": 4.0753424657534246e-07, - "loss": 0.258, - "num_tokens": 89057525.0, - "reward": 0.978515625, - "reward_std": 0.38398873805999756, - "rewards/accuracy_reward/mean": 0.17578125, - "rewards/accuracy_reward/std": 0.3810062110424042, + "grad_norm": 0.10846513509750366, + "kl": 0.00201416015625, + "learning_rate": 4.061433447098976e-07, + "loss": 0.1341, + "num_tokens": 111579048.0, + "reward": 0.580078125, + "reward_std": 0.291300892829895, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.802734375, - "rewards/tag_count_reward/std": 0.29369157552719116, + "rewards/tag_count_reward/mean": 0.4609375, + "rewards/tag_count_reward/std": 0.21373461186885834, "step": 120 }, { @@ -3495,27 +3495,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.171875, + "completions/clipped_ratio": 0.486328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 1028.244140625, - "completions/mean_terminated_length": 816.5967407226562, - "completions/min_length": 85.0, - "completions/min_terminated_length": 85.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1572.888671875, + "completions/mean_terminated_length": 1123.0684814453125, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, "epoch": 0.04130750192028676, - "grad_norm": 0.1524491012096405, - "kl": 0.020660400390625, - "learning_rate": 4.10958904109589e-07, - "loss": 0.2634, - "num_tokens": 89660834.0, - "reward": 0.94384765625, - "reward_std": 0.32741492986679077, - "rewards/accuracy_reward/mean": 0.1270161271095276, - "rewards/accuracy_reward/std": 0.3333272337913513, + "grad_norm": 0.10465190559625626, + "kl": 0.0020465850830078125, + "learning_rate": 4.09556313993174e-07, + "loss": 0.1174, + "num_tokens": 112461215.0, + "reward": 0.5166015625, + "reward_std": 0.24055497348308563, + "rewards/accuracy_reward/mean": 0.08064515888690948, + "rewards/accuracy_reward/std": 0.2725643217563629, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.82080078125, - "rewards/tag_count_reward/std": 0.285394549369812, + "rewards/tag_count_reward/mean": 0.4384765625, + "rewards/tag_count_reward/std": 0.21643351018428802, "step": 121 }, { @@ -3524,27 +3524,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.197265625, + "completions/clipped_ratio": 0.5078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1129.1484375, - "completions/mean_terminated_length": 903.347900390625, - "completions/min_length": 87.0, - "completions/min_terminated_length": 87.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1602.263671875, + "completions/mean_terminated_length": 1142.3770751953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, "epoch": 0.04164888623367756, - "grad_norm": 0.14967991411685944, - "kl": 0.0193634033203125, - "learning_rate": 4.143835616438356e-07, - "loss": 0.2399, - "num_tokens": 90312590.0, - "reward": 0.9462890625, - "reward_std": 0.3499157726764679, - "rewards/accuracy_reward/mean": 0.14453125, - "rewards/accuracy_reward/std": 0.35197147727012634, + "grad_norm": 0.11351174116134644, + "kl": 0.0019683837890625, + "learning_rate": 4.1296928327645045e-07, + "loss": 0.1353, + "num_tokens": 113355206.0, + "reward": 0.5439453125, + "reward_std": 0.25091999769210815, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8017578125, - "rewards/tag_count_reward/std": 0.2967647314071655, + "rewards/tag_count_reward/mean": 0.4345703125, + "rewards/tag_count_reward/std": 0.2095242291688919, "step": 122 }, { @@ -3553,27 +3553,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.16796875, + "completions/clipped_ratio": 0.4296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 1017.66796875, - "completions/mean_terminated_length": 809.6666870117188, - "completions/min_length": 35.0, - "completions/min_terminated_length": 35.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1469.9765625, + "completions/mean_terminated_length": 1034.4794921875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.04199027054706836, - "grad_norm": 0.18192212283611298, - "kl": 0.02178955078125, - "learning_rate": 4.1780821917808217e-07, - "loss": 0.1701, - "num_tokens": 90911492.0, - "reward": 0.9931640625, - "reward_std": 0.38202834129333496, - "rewards/accuracy_reward/mean": 0.16796875, - "rewards/accuracy_reward/std": 0.374204158782959, + "grad_norm": 0.11196348071098328, + "kl": 0.002208709716796875, + "learning_rate": 4.1638225255972693e-07, + "loss": 0.1005, + "num_tokens": 114185690.0, + "reward": 0.59912109375, + "reward_std": 0.2856624722480774, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8251953125, - "rewards/tag_count_reward/std": 0.2775168716907501, + "rewards/tag_count_reward/mean": 0.47802734375, + "rewards/tag_count_reward/std": 0.23173095285892487, "step": 123 }, { @@ -3582,27 +3582,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1328125, + "completions/clipped_ratio": 0.357421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 939.9609375, - "completions/mean_terminated_length": 770.2612915039062, - "completions/min_length": 43.0, - "completions/min_terminated_length": 43.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1376.5625, + "completions/mean_terminated_length": 1003.088134765625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.04233165486045916, - "grad_norm": 0.1643485575914383, - "kl": 0.022003173828125, - "learning_rate": 4.212328767123288e-07, - "loss": 0.1862, - "num_tokens": 91465520.0, - "reward": 1.048828125, - "reward_std": 0.3515176773071289, - "rewards/accuracy_reward/mean": 0.17578125, - "rewards/accuracy_reward/std": 0.3810062110424042, + "grad_norm": 0.1199040487408638, + "kl": 0.0027313232421875, + "learning_rate": 4.1979522184300336e-07, + "loss": 0.1037, + "num_tokens": 114963258.0, + "reward": 0.63427734375, + "reward_std": 0.2894349694252014, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.873046875, - "rewards/tag_count_reward/std": 0.2497476041316986, + "rewards/tag_count_reward/mean": 0.49560546875, + "rewards/tag_count_reward/std": 0.234823539853096, "step": 124 }, { @@ -3611,27 +3611,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.154296875, + "completions/clipped_ratio": 0.400390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 944.138671875, - "completions/mean_terminated_length": 742.7413330078125, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1441.24609375, + "completions/mean_terminated_length": 1036.0845947265625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, "epoch": 0.04267303917384996, - "grad_norm": 0.41532760858535767, - "kl": 0.027557373046875, - "learning_rate": 4.246575342465753e-07, - "loss": 0.2359, - "num_tokens": 92035719.0, - "reward": 0.9375, - "reward_std": 0.30874431133270264, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, + "grad_norm": 0.13748203217983246, + "kl": 0.003017425537109375, + "learning_rate": 4.2320819112627985e-07, + "loss": 0.1128, + "num_tokens": 115787976.0, + "reward": 0.53466796875, + "reward_std": 0.2665495276451111, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.841796875, - "rewards/tag_count_reward/std": 0.27380964159965515, + "rewards/tag_count_reward/mean": 0.46630859375, + "rewards/tag_count_reward/std": 0.21663375198841095, "step": 125 }, { @@ -3640,27 +3640,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.177734375, + "completions/clipped_ratio": 0.474609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1018.701171875, - "completions/mean_terminated_length": 796.2161865234375, - "completions/min_length": 85.0, - "completions/min_terminated_length": 85.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1526.8828125, + "completions/mean_terminated_length": 1056.1337890625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.043014423487240765, - "grad_norm": 0.3897092640399933, - "kl": 0.0355224609375, - "learning_rate": 4.280821917808219e-07, - "loss": 0.2392, - "num_tokens": 92637870.0, - "reward": 0.93115234375, - "reward_std": 0.3366728723049164, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, + "grad_norm": 0.1286364197731018, + "kl": 0.0027313232421875, + "learning_rate": 4.2662116040955633e-07, + "loss": 0.13, + "num_tokens": 116650316.0, + "reward": 0.5517578125, + "reward_std": 0.2630772590637207, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.82763671875, - "rewards/tag_count_reward/std": 0.284898579120636, + "rewards/tag_count_reward/mean": 0.4638671875, + "rewards/tag_count_reward/std": 0.24212263524532318, "step": 126 }, { @@ -3669,27 +3669,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.150390625, + "completions/clipped_ratio": 0.314453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 987.669921875, - "completions/mean_terminated_length": 799.9793090820312, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1331.77734375, + "completions/mean_terminated_length": 1003.2535400390625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, "epoch": 0.043355807800631564, - "grad_norm": 0.133542001247406, - "kl": 0.0201416015625, - "learning_rate": 4.315068493150685e-07, - "loss": 0.2343, - "num_tokens": 93214229.0, - "reward": 0.93701171875, - "reward_std": 0.3180956542491913, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, + "grad_norm": 0.12783904373645782, + "kl": 0.00225067138671875, + "learning_rate": 4.3003412969283276e-07, + "loss": 0.1329, + "num_tokens": 117402858.0, + "reward": 0.59228515625, + "reward_std": 0.2673385739326477, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.83740234375, - "rewards/tag_count_reward/std": 0.2776799499988556, + "rewards/tag_count_reward/mean": 0.52978515625, + "rewards/tag_count_reward/std": 0.25017908215522766, "step": 127 }, { @@ -3698,27 +3698,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.158203125, + "completions/clipped_ratio": 0.41015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 992.728515625, - "completions/mean_terminated_length": 794.406005859375, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1465.98046875, + "completions/mean_terminated_length": 1061.264892578125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.043697192114022364, - "grad_norm": 0.155024453997612, - "kl": 0.02215576171875, - "learning_rate": 4.3493150684931507e-07, - "loss": 0.2559, - "num_tokens": 93803706.0, - "reward": 0.92724609375, - "reward_std": 0.3279414772987366, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, + "grad_norm": 0.12041183561086655, + "kl": 0.0027179718017578125, + "learning_rate": 4.334470989761092e-07, + "loss": 0.1546, + "num_tokens": 118234640.0, + "reward": 0.5439453125, + "reward_std": 0.253177285194397, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.82373046875, - "rewards/tag_count_reward/std": 0.2820604145526886, + "rewards/tag_count_reward/mean": 0.4619140625, + "rewards/tag_count_reward/std": 0.21899667382240295, "step": 128 }, { @@ -3727,27 +3727,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.130859375, + "completions/clipped_ratio": 0.4140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 1000.693359375, - "completions/mean_terminated_length": 843.0089721679688, - "completions/min_length": 77.0, - "completions/min_terminated_length": 77.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1468.15234375, + "completions/mean_terminated_length": 1058.393310546875, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, "epoch": 0.04403857642741316, - "grad_norm": 0.14573651552200317, - "kl": 0.021942138671875, - "learning_rate": 4.383561643835616e-07, - "loss": 0.2046, - "num_tokens": 94396829.0, - "reward": 0.97998046875, - "reward_std": 0.33831119537353516, - "rewards/accuracy_reward/mean": 0.140625, - "rewards/accuracy_reward/std": 0.3479743003845215, + "grad_norm": 0.11871069669723511, + "kl": 0.002471923828125, + "learning_rate": 4.3686006825938567e-07, + "loss": 0.1482, + "num_tokens": 119067102.0, + "reward": 0.60205078125, + "reward_std": 0.2857625484466553, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.83935546875, - "rewards/tag_count_reward/std": 0.27035340666770935, + "rewards/tag_count_reward/mean": 0.47509765625, + "rewards/tag_count_reward/std": 0.22554562985897064, "step": 129 }, { @@ -3756,27 +3756,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.126953125, + "completions/clipped_ratio": 0.353515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1026.716796875, - "completions/mean_terminated_length": 878.2080688476562, - "completions/min_length": 79.0, - "completions/min_terminated_length": 79.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1398.197265625, + "completions/mean_terminated_length": 1042.8670654296875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, "epoch": 0.04437996074080396, - "grad_norm": 0.13692845404148102, - "kl": 0.02105712890625, - "learning_rate": 4.417808219178082e-07, - "loss": 0.1291, - "num_tokens": 95002332.0, - "reward": 0.97998046875, - "reward_std": 0.3523958921432495, - "rewards/accuracy_reward/mean": 0.138671875, - "rewards/accuracy_reward/std": 0.34594178199768066, + "grad_norm": 0.12083795666694641, + "kl": 0.002685546875, + "learning_rate": 4.402730375426621e-07, + "loss": 0.109, + "num_tokens": 119862803.0, + "reward": 0.64306640625, + "reward_std": 0.3147738575935364, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.84130859375, - "rewards/tag_count_reward/std": 0.26512494683265686, + "rewards/tag_count_reward/mean": 0.50634765625, + "rewards/tag_count_reward/std": 0.2368534356355667, "step": 130 }, { @@ -3785,27 +3785,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.2890625, + "completions/clipped_ratio": 0.49609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1252.416015625, - "completions/mean_terminated_length": 928.9368286132812, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1609.705078125, + "completions/mean_terminated_length": 1178.2054443359375, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, "epoch": 0.04472134505419476, - "grad_norm": 0.12116413563489914, - "kl": 0.021484375, - "learning_rate": 4.4520547945205477e-07, - "loss": 0.1845, - "num_tokens": 95722913.0, - "reward": 0.84814453125, - "reward_std": 0.3764338493347168, - "rewards/accuracy_reward/mean": 0.12890625, - "rewards/accuracy_reward/std": 0.33542385697364807, + "grad_norm": 0.10432874411344528, + "kl": 0.00284576416015625, + "learning_rate": 4.4368600682593853e-07, + "loss": 0.1089, + "num_tokens": 120766316.0, + "reward": 0.55419921875, + "reward_std": 0.3031613230705261, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.71923828125, - "rewards/tag_count_reward/std": 0.32793310284614563, + "rewards/tag_count_reward/mean": 0.46435546875, + "rewards/tag_count_reward/std": 0.2459535002708435, "step": 131 }, { @@ -3814,27 +3814,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.255859375, + "completions/clipped_ratio": 0.54296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 1201.01953125, - "completions/mean_terminated_length": 909.800537109375, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1637.5078125, + "completions/mean_terminated_length": 1149.8291015625, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, "epoch": 0.04506272936758556, - "grad_norm": 0.1327248513698578, - "kl": 0.02069091796875, - "learning_rate": 4.4863013698630134e-07, - "loss": 0.1875, - "num_tokens": 96425483.0, - "reward": 0.89013671875, - "reward_std": 0.3610924482345581, - "rewards/accuracy_reward/mean": 0.150390625, - "rewards/accuracy_reward/std": 0.35780346393585205, + "grad_norm": 0.10717989504337311, + "kl": 0.00232696533203125, + "learning_rate": 4.4709897610921496e-07, + "loss": 0.1285, + "num_tokens": 121692368.0, + "reward": 0.5263671875, + "reward_std": 0.2632749676704407, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.73974609375, - "rewards/tag_count_reward/std": 0.3116576075553894, + "rewards/tag_count_reward/mean": 0.4482421875, + "rewards/tag_count_reward/std": 0.23409658670425415, "step": 132 }, { @@ -3843,27 +3843,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.216796875, + "completions/clipped_ratio": 0.431640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1162.76171875, - "completions/mean_terminated_length": 917.720703125, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1526.857421875, + "completions/mean_terminated_length": 1131.0755615234375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.04540411368097636, - "grad_norm": 0.14228351414203644, - "kl": 0.021026611328125, - "learning_rate": 4.520547945205479e-07, - "loss": 0.1876, - "num_tokens": 97095233.0, - "reward": 0.861328125, - "reward_std": 0.33468449115753174, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, + "grad_norm": 0.12041887640953064, + "kl": 0.002719879150390625, + "learning_rate": 4.5051194539249145e-07, + "loss": 0.1176, + "num_tokens": 122548535.0, + "reward": 0.53564453125, + "reward_std": 0.26876211166381836, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.783203125, - "rewards/tag_count_reward/std": 0.3002299964427948, + "rewards/tag_count_reward/mean": 0.47705078125, + "rewards/tag_count_reward/std": 0.2273726761341095, "step": 133 }, { @@ -3872,27 +3872,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.21484375, + "completions/clipped_ratio": 0.357421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1988.0, - "completions/mean_length": 1092.31640625, - "completions/mean_terminated_length": 830.8109130859375, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1395.3515625, + "completions/mean_terminated_length": 1032.3282470703125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.04574549799436716, - "grad_norm": 0.14528445899486542, - "kl": 0.02288818359375, - "learning_rate": 4.554794520547945e-07, - "loss": 0.2004, - "num_tokens": 97728499.0, - "reward": 0.89990234375, - "reward_std": 0.3530845642089844, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, + "grad_norm": 0.11840149015188217, + "kl": 0.0031280517578125, + "learning_rate": 4.539249146757679e-07, + "loss": 0.1338, + "num_tokens": 123336955.0, + "reward": 0.630859375, + "reward_std": 0.2869810461997986, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.78466796875, - "rewards/tag_count_reward/std": 0.30431386828422546, + "rewards/tag_count_reward/mean": 0.53515625, + "rewards/tag_count_reward/std": 0.2653955817222595, "step": 134 }, { @@ -3901,27 +3901,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.17578125, + "completions/clipped_ratio": 0.431640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1047.08984375, - "completions/mean_terminated_length": 833.6256103515625, - "completions/min_length": 75.0, - "completions/min_terminated_length": 75.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1452.38671875, + "completions/mean_terminated_length": 1000.048095703125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.04608688230775796, - "grad_norm": 0.1438317745923996, - "kl": 0.022674560546875, - "learning_rate": 4.589041095890411e-07, - "loss": 0.2098, - "num_tokens": 98339809.0, - "reward": 1.0126953125, - "reward_std": 0.39924299716949463, - "rewards/accuracy_reward/mean": 0.21875, - "rewards/accuracy_reward/std": 0.41380295157432556, + "grad_norm": 0.12020183354616165, + "kl": 0.003162384033203125, + "learning_rate": 4.573378839590443e-07, + "loss": 0.1277, + "num_tokens": 124155777.0, + "reward": 0.6162109375, + "reward_std": 0.3152843117713928, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7939453125, - "rewards/tag_count_reward/std": 0.2918033301830292, + "rewards/tag_count_reward/mean": 0.4853515625, + "rewards/tag_count_reward/std": 0.24235931038856506, "step": 135 }, { @@ -3930,27 +3930,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.169921875, + "completions/clipped_ratio": 0.404296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 1103.76953125, - "completions/mean_terminated_length": 910.47998046875, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1493.4375, + "completions/mean_terminated_length": 1117.062255859375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.04642826662114876, - "grad_norm": 0.13441050052642822, - "kl": 0.02557373046875, - "learning_rate": 4.6232876712328767e-07, - "loss": 0.2169, - "num_tokens": 98983547.0, - "reward": 0.9375, - "reward_std": 0.33622604608535767, - "rewards/accuracy_reward/mean": 0.1270161271095276, - "rewards/accuracy_reward/std": 0.33332720398902893, + "grad_norm": 0.11123418807983398, + "kl": 0.003223419189453125, + "learning_rate": 4.6075085324232084e-07, + "loss": 0.1297, + "num_tokens": 124999025.0, + "reward": 0.5966796875, + "reward_std": 0.2818424701690674, + "rewards/accuracy_reward/mean": 0.12298387289047241, + "rewards/accuracy_reward/std": 0.32875028252601624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.814453125, - "rewards/tag_count_reward/std": 0.2854016423225403, + "rewards/tag_count_reward/mean": 0.4775390625, + "rewards/tag_count_reward/std": 0.22982890903949738, "step": 136 }, { @@ -3959,27 +3959,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.248046875, + "completions/clipped_ratio": 0.486328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 1172.1171875, - "completions/mean_terminated_length": 883.1895751953125, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1566.46875, + "completions/mean_terminated_length": 1110.5703125, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, "epoch": 0.04676965093453956, - "grad_norm": 0.1310551017522812, - "kl": 0.021697998046875, - "learning_rate": 4.657534246575342e-07, - "loss": 0.1996, - "num_tokens": 99669991.0, - "reward": 0.8798828125, - "reward_std": 0.346016526222229, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, + "grad_norm": 0.10627542436122894, + "kl": 0.00283050537109375, + "learning_rate": 4.641638225255973e-07, + "loss": 0.142, + "num_tokens": 125887377.0, + "reward": 0.544921875, + "reward_std": 0.2709546983242035, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7568359375, - "rewards/tag_count_reward/std": 0.3158440887928009, + "rewards/tag_count_reward/mean": 0.474609375, + "rewards/tag_count_reward/std": 0.25042009353637695, "step": 137 }, { @@ -3988,27 +3988,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.21484375, + "completions/clipped_ratio": 0.38671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1104.271484375, - "completions/mean_terminated_length": 846.0372924804688, - "completions/min_length": 63.0, - "completions/min_terminated_length": 63.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1407.01953125, + "completions/mean_terminated_length": 1002.8344116210938, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, "epoch": 0.04711103524793036, - "grad_norm": 0.13153010606765747, - "kl": 0.0242919921875, - "learning_rate": 4.691780821917808e-07, - "loss": 0.2479, - "num_tokens": 100306738.0, - "reward": 0.8525390625, - "reward_std": 0.33619534969329834, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, + "grad_norm": 0.12202514708042145, + "kl": 0.00362396240234375, + "learning_rate": 4.675767918088737e-07, + "loss": 0.1502, + "num_tokens": 126679131.0, + "reward": 0.57666015625, + "reward_std": 0.2793852388858795, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7724609375, - "rewards/tag_count_reward/std": 0.3032495379447937, + "rewards/tag_count_reward/mean": 0.49072265625, + "rewards/tag_count_reward/std": 0.24237261712551117, "step": 138 }, { @@ -4017,27 +4017,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.2265625, + "completions/clipped_ratio": 0.4921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 1173.05859375, - "completions/mean_terminated_length": 916.7626342773438, - "completions/min_length": 60.0, - "completions/min_terminated_length": 60.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1547.798828125, + "completions/mean_terminated_length": 1062.9884033203125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.04745241956132116, - "grad_norm": 0.15235306322574615, - "kl": 0.0244140625, - "learning_rate": 4.726027397260274e-07, - "loss": 0.2333, - "num_tokens": 100988768.0, - "reward": 0.89306640625, - "reward_std": 0.33702439069747925, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, + "grad_norm": 0.10932815074920654, + "kl": 0.003513336181640625, + "learning_rate": 4.709897610921502e-07, + "loss": 0.1659, + "num_tokens": 127553028.0, + "reward": 0.5849609375, + "reward_std": 0.2855178117752075, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.77783203125, - "rewards/tag_count_reward/std": 0.30381107330322266, + "rewards/tag_count_reward/mean": 0.5029296875, + "rewards/tag_count_reward/std": 0.2677024304866791, "step": 139 }, { @@ -4046,27 +4046,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.19921875, + "completions/clipped_ratio": 0.447265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1096.26953125, - "completions/mean_terminated_length": 859.49755859375, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1503.876953125, + "completions/mean_terminated_length": 1063.5794677734375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.04779380387471196, - "grad_norm": 0.14825449883937836, - "kl": 0.0213623046875, - "learning_rate": 4.7602739726027394e-07, - "loss": 0.1917, - "num_tokens": 101621482.0, - "reward": 0.92822265625, - "reward_std": 0.3136889934539795, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310528099536896, + "grad_norm": 0.11491069197654724, + "kl": 0.003360748291015625, + "learning_rate": 4.744027303754266e-07, + "loss": 0.1303, + "num_tokens": 128394437.0, + "reward": 0.5673828125, + "reward_std": 0.2512876093387604, + "rewards/accuracy_reward/mean": 0.08669354766607285, + "rewards/accuracy_reward/std": 0.281669557094574, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.80712890625, - "rewards/tag_count_reward/std": 0.2897177040576935, + "rewards/tag_count_reward/mean": 0.4833984375, + "rewards/tag_count_reward/std": 0.23815946280956268, "step": 140 }, { @@ -4075,27 +4075,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.203125, + "completions/clipped_ratio": 0.37109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 1127.798828125, - "completions/mean_terminated_length": 893.23779296875, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1356.484375, + "completions/mean_terminated_length": 948.4472045898438, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.04813518818810276, - "grad_norm": 0.13674724102020264, - "kl": 0.0216064453125, - "learning_rate": 4.794520547945205e-07, - "loss": 0.2317, - "num_tokens": 102276419.0, - "reward": 0.9462890625, - "reward_std": 0.3898537755012512, - "rewards/accuracy_reward/mean": 0.154296875, - "rewards/accuracy_reward/std": 0.36158639192581177, + "grad_norm": 0.1220201849937439, + "kl": 0.003894805908203125, + "learning_rate": 4.778156996587031e-07, + "loss": 0.1608, + "num_tokens": 129166461.0, + "reward": 0.68994140625, + "reward_std": 0.33712685108184814, + "rewards/accuracy_reward/mean": 0.162109375, + "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7919921875, - "rewards/tag_count_reward/std": 0.2987160086631775, + "rewards/tag_count_reward/mean": 0.52783203125, + "rewards/tag_count_reward/std": 0.26233145594596863, "step": 141 }, { @@ -4104,27 +4104,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.23828125, + "completions/clipped_ratio": 0.41015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 1158.375, - "completions/mean_terminated_length": 880.0820922851562, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1453.32421875, + "completions/mean_terminated_length": 1039.8079833984375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.04847657250149356, - "grad_norm": 0.12955734133720398, - "kl": 0.023406982421875, - "learning_rate": 4.828767123287671e-07, - "loss": 0.2187, - "num_tokens": 102952787.0, - "reward": 0.88818359375, - "reward_std": 0.3411647081375122, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, + "grad_norm": 0.11203473806381226, + "kl": 0.003520965576171875, + "learning_rate": 4.812286689419795e-07, + "loss": 0.1208, + "num_tokens": 129993843.0, + "reward": 0.580078125, + "reward_std": 0.2685844600200653, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.78076171875, - "rewards/tag_count_reward/std": 0.3130495250225067, + "rewards/tag_count_reward/mean": 0.501953125, + "rewards/tag_count_reward/std": 0.2497476041316986, "step": 142 }, { @@ -4133,27 +4133,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.23828125, + "completions/clipped_ratio": 0.384765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 1182.376953125, - "completions/mean_terminated_length": 911.5923461914062, - "completions/min_length": 174.0, - "completions/min_terminated_length": 174.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1444.095703125, + "completions/mean_terminated_length": 1066.4158935546875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.04881795681488436, - "grad_norm": 0.1215982586145401, - "kl": 0.024078369140625, - "learning_rate": 4.863013698630137e-07, - "loss": 0.2179, - "num_tokens": 103631364.0, - "reward": 0.92236328125, - "reward_std": 0.3439157009124756, - "rewards/accuracy_reward/mean": 0.138671875, - "rewards/accuracy_reward/std": 0.34594178199768066, + "grad_norm": 0.10747040063142776, + "kl": 0.00374603271484375, + "learning_rate": 4.84641638225256e-07, + "loss": 0.1554, + "num_tokens": 130806420.0, + "reward": 0.6005859375, + "reward_std": 0.28536751866340637, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.78369140625, - "rewards/tag_count_reward/std": 0.3123558759689331, + "rewards/tag_count_reward/mean": 0.5048828125, + "rewards/tag_count_reward/std": 0.24575771391391754, "step": 143 }, { @@ -4162,27 +4162,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.23046875, + "completions/clipped_ratio": 0.34375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1130.21875, - "completions/mean_terminated_length": 855.3502197265625, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1384.15625, + "completions/mean_terminated_length": 1036.4285888671875, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, "epoch": 0.04915934112827516, - "grad_norm": 0.15443135797977448, - "kl": 0.0263671875, - "learning_rate": 4.897260273972603e-07, - "loss": 0.1898, - "num_tokens": 104289572.0, - "reward": 0.94482421875, - "reward_std": 0.36273717880249023, - "rewards/accuracy_reward/mean": 0.154296875, - "rewards/accuracy_reward/std": 0.36158639192581177, + "grad_norm": 0.13118286430835724, + "kl": 0.0048675537109375, + "learning_rate": 4.880546075085323e-07, + "loss": 0.1429, + "num_tokens": 131594644.0, + "reward": 0.63671875, + "reward_std": 0.29929935932159424, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.79052734375, - "rewards/tag_count_reward/std": 0.30479392409324646, + "rewards/tag_count_reward/mean": 0.52734375, + "rewards/tag_count_reward/std": 0.24874316155910492, "step": 144 }, { @@ -4191,27 +4191,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.140625, + "completions/clipped_ratio": 0.251953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 935.13671875, - "completions/mean_terminated_length": 753.0317993164062, - "completions/min_length": 95.0, - "completions/min_terminated_length": 95.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1159.37109375, + "completions/mean_terminated_length": 860.06787109375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, "epoch": 0.049500725441665956, - "grad_norm": 0.16990253329277039, - "kl": 0.03143310546875, - "learning_rate": 4.931506849315068e-07, - "loss": 0.2292, - "num_tokens": 104838666.0, - "reward": 1.03076171875, - "reward_std": 0.35108163952827454, - "rewards/accuracy_reward/mean": 0.19921875, - "rewards/accuracy_reward/std": 0.39980348944664, + "grad_norm": 0.15537595748901367, + "kl": 0.005615234375, + "learning_rate": 4.914675767918088e-07, + "loss": 0.1561, + "num_tokens": 132258546.0, + "reward": 0.7529296875, + "reward_std": 0.3398086130619049, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.83154296875, - "rewards/tag_count_reward/std": 0.26829564571380615, + "rewards/tag_count_reward/mean": 0.6083984375, + "rewards/tag_count_reward/std": 0.26397860050201416, "step": 145 }, { @@ -4220,27 +4220,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.197265625, + "completions/clipped_ratio": 0.3984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 1123.224609375, - "completions/mean_terminated_length": 895.9683837890625, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1415.310546875, + "completions/mean_terminated_length": 996.2564697265625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, "epoch": 0.049842109755056756, - "grad_norm": 0.13389046490192413, - "kl": 0.026123046875, - "learning_rate": 4.965753424657534e-07, - "loss": 0.1889, - "num_tokens": 105487933.0, - "reward": 0.92431640625, - "reward_std": 0.3295518159866333, - "rewards/accuracy_reward/mean": 0.12890625, - "rewards/accuracy_reward/std": 0.33542385697364807, + "grad_norm": 0.12384098023176193, + "kl": 0.004154205322265625, + "learning_rate": 4.948805460750853e-07, + "loss": 0.1764, + "num_tokens": 133057361.0, + "reward": 0.60693359375, + "reward_std": 0.3001461923122406, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.79541015625, - "rewards/tag_count_reward/std": 0.2959502041339874, + "rewards/tag_count_reward/mean": 0.51708984375, + "rewards/tag_count_reward/std": 0.2627863883972168, "step": 146 }, { @@ -4249,27 +4249,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.162109375, + "completions/clipped_ratio": 0.296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1083.4921875, - "completions/mean_terminated_length": 896.8858032226562, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1329.32421875, + "completions/mean_terminated_length": 1025.88330078125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, "epoch": 0.050183494068447555, - "grad_norm": 0.12234027683734894, - "kl": 0.026885986328125, - "learning_rate": 5e-07, - "loss": 0.1988, - "num_tokens": 106121801.0, - "reward": 0.97607421875, - "reward_std": 0.3266531229019165, - "rewards/accuracy_reward/mean": 0.1391129046678543, - "rewards/accuracy_reward/std": 0.3464137017726898, + "grad_norm": 0.13346366584300995, + "kl": 0.00533294677734375, + "learning_rate": 4.982935153583617e-07, + "loss": 0.175, + "num_tokens": 133817095.0, + "reward": 0.67529296875, + "reward_std": 0.30715054273605347, + "rewards/accuracy_reward/mean": 0.11088709533214569, + "rewards/accuracy_reward/std": 0.3143092691898346, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.84130859375, - "rewards/tag_count_reward/std": 0.27553123235702515, + "rewards/tag_count_reward/mean": 0.56787109375, + "rewards/tag_count_reward/std": 0.2643308639526367, "step": 147 }, { @@ -4278,27 +4278,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.228515625, + "completions/clipped_ratio": 0.412109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 1198.45703125, - "completions/mean_terminated_length": 946.8202514648438, - "completions/min_length": 196.0, - "completions/min_terminated_length": 196.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1452.392578125, + "completions/mean_terminated_length": 1034.873779296875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.050524878381838355, - "grad_norm": 0.12022490799427032, - "kl": 0.024627685546875, - "learning_rate": 5.034246575342465e-07, - "loss": 0.2133, - "num_tokens": 106814163.0, - "reward": 0.9658203125, - "reward_std": 0.3609035015106201, - "rewards/accuracy_reward/mean": 0.1796875, - "rewards/accuracy_reward/std": 0.38430243730545044, + "grad_norm": 0.11251989006996155, + "kl": 0.0041656494140625, + "learning_rate": 5.017064846416383e-07, + "loss": 0.1391, + "num_tokens": 134639472.0, + "reward": 0.64111328125, + "reward_std": 0.31187868118286133, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7861328125, - "rewards/tag_count_reward/std": 0.3063468933105469, + "rewards/tag_count_reward/mean": 0.52978515625, + "rewards/tag_count_reward/std": 0.2744261920452118, "step": 148 }, { @@ -4307,27 +4307,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.27734375, + "completions/clipped_ratio": 0.396484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 1268.662109375, - "completions/mean_terminated_length": 969.5648803710938, - "completions/min_length": 169.0, - "completions/min_terminated_length": 169.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1482.369140625, + "completions/mean_terminated_length": 1110.7735595703125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, "epoch": 0.050866262695229154, - "grad_norm": 0.12187255173921585, - "kl": 0.02496337890625, - "learning_rate": 5.068493150684931e-07, - "loss": 0.2075, - "num_tokens": 107541798.0, - "reward": 0.8916015625, - "reward_std": 0.3265082538127899, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, + "grad_norm": 0.12822338938713074, + "kl": 0.00409698486328125, + "learning_rate": 5.051194539249146e-07, + "loss": 0.1455, + "num_tokens": 135476525.0, + "reward": 0.62353515625, + "reward_std": 0.3073246479034424, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7646484375, - "rewards/tag_count_reward/std": 0.3182791769504547, + "rewards/tag_count_reward/mean": 0.51806640625, + "rewards/tag_count_reward/std": 0.24737520515918732, "step": 149 }, { @@ -4336,27 +4336,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.20703125, + "completions/clipped_ratio": 0.3515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 1077.37890625, - "completions/mean_terminated_length": 823.9655151367188, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1319.892578125, + "completions/mean_terminated_length": 925.135498046875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, "epoch": 0.051207647008619954, - "grad_norm": 0.12971220910549164, - "kl": 0.03204345703125, - "learning_rate": 5.102739726027398e-07, - "loss": 0.2329, - "num_tokens": 108168616.0, - "reward": 0.9423828125, - "reward_std": 0.34843137860298157, - "rewards/accuracy_reward/mean": 0.119140625, - "rewards/accuracy_reward/std": 0.32427072525024414, + "grad_norm": 0.13123086094856262, + "kl": 0.005870819091796875, + "learning_rate": 5.085324232081911e-07, + "loss": 0.1679, + "num_tokens": 136227510.0, + "reward": 0.63525390625, + "reward_std": 0.2943967878818512, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8232421875, - "rewards/tag_count_reward/std": 0.2913576364517212, + "rewards/tag_count_reward/mean": 0.54150390625, + "rewards/tag_count_reward/std": 0.27199897170066833, "step": 150 }, { @@ -4365,27 +4365,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.224609375, + "completions/clipped_ratio": 0.314453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 1141.125, - "completions/mean_terminated_length": 878.4281616210938, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1344.75, + "completions/mean_terminated_length": 1022.1766357421875, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, "epoch": 0.051549031322010753, - "grad_norm": 0.1362224668264389, - "kl": 0.028289794921875, - "learning_rate": 5.136986301369864e-07, - "loss": 0.2354, - "num_tokens": 108833640.0, - "reward": 0.87890625, - "reward_std": 0.30723893642425537, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, + "grad_norm": 0.13723763823509216, + "kl": 0.00555419921875, + "learning_rate": 5.119453924914675e-07, + "loss": 0.1562, + "num_tokens": 136996790.0, + "reward": 0.650390625, + "reward_std": 0.2970719337463379, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.802734375, - "rewards/tag_count_reward/std": 0.30231064558029175, + "rewards/tag_count_reward/mean": 0.5859375, + "rewards/tag_count_reward/std": 0.2835811972618103, "step": 151 }, { @@ -4394,27 +4394,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.220703125, + "completions/clipped_ratio": 0.357421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 1187.841796875, - "completions/mean_terminated_length": 944.2380981445312, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1424.36328125, + "completions/mean_terminated_length": 1077.4771728515625, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, "epoch": 0.05189041563540155, - "grad_norm": 0.13775372505187988, - "kl": 0.0277099609375, - "learning_rate": 5.171232876712328e-07, - "loss": 0.1825, - "num_tokens": 109526151.0, - "reward": 0.93359375, - "reward_std": 0.33710235357284546, - "rewards/accuracy_reward/mean": 0.12109375, - "rewards/accuracy_reward/std": 0.3265552520751953, + "grad_norm": 0.1149178147315979, + "kl": 0.0048980712890625, + "learning_rate": 5.15358361774744e-07, + "loss": 0.1576, + "num_tokens": 137810400.0, + "reward": 0.68115234375, + "reward_std": 0.32927078008651733, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8125, - "rewards/tag_count_reward/std": 0.2942701280117035, + "rewards/tag_count_reward/mean": 0.57568359375, + "rewards/tag_count_reward/std": 0.270010381937027, "step": 152 }, { @@ -4423,27 +4423,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.173828125, + "completions/clipped_ratio": 0.380859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 1140.255859375, - "completions/mean_terminated_length": 949.2647705078125, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1432.234375, + "completions/mean_terminated_length": 1053.451171875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.05223179994879235, - "grad_norm": 0.12133099883794785, - "kl": 0.025604248046875, - "learning_rate": 5.205479452054794e-07, - "loss": 0.214, - "num_tokens": 110184522.0, - "reward": 0.92578125, - "reward_std": 0.3521662950515747, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, + "grad_norm": 0.11075170338153839, + "kl": 0.00516510009765625, + "learning_rate": 5.187713310580204e-07, + "loss": 0.1756, + "num_tokens": 138618264.0, + "reward": 0.63720703125, + "reward_std": 0.3462282419204712, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.828125, - "rewards/tag_count_reward/std": 0.28325754404067993, + "rewards/tag_count_reward/mean": 0.55126953125, + "rewards/tag_count_reward/std": 0.2774699628353119, "step": 153 }, { @@ -4452,27 +4452,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.177734375, + "completions/clipped_ratio": 0.33984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1113.87890625, - "completions/mean_terminated_length": 911.966796875, - "completions/min_length": 117.0, - "completions/min_terminated_length": 117.0, + "completions/mean_length": 1356.251953125, + "completions/mean_terminated_length": 1000.1449584960938, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.05257318426218315, - "grad_norm": 0.22379489243030548, - "kl": 0.02886962890625, - "learning_rate": 5.23972602739726e-07, - "loss": 0.1919, - "num_tokens": 110828988.0, - "reward": 0.99609375, - "reward_std": 0.3244497776031494, - "rewards/accuracy_reward/mean": 0.154296875, - "rewards/accuracy_reward/std": 0.36158639192581177, + "grad_norm": 0.1275874823331833, + "kl": 0.006683349609375, + "learning_rate": 5.221843003412969e-07, + "loss": 0.1505, + "num_tokens": 139386825.0, + "reward": 0.70361328125, + "reward_std": 0.3365657329559326, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.841796875, - "rewards/tag_count_reward/std": 0.2755906283855438, + "rewards/tag_count_reward/mean": 0.58642578125, + "rewards/tag_count_reward/std": 0.28707703948020935, "step": 154 }, { @@ -4481,27 +4481,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.166015625, + "completions/clipped_ratio": 0.28515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 1084.51953125, - "completions/mean_terminated_length": 892.7259521484375, - "completions/min_length": 82.0, - "completions/min_terminated_length": 82.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1266.794921875, + "completions/mean_terminated_length": 955.1666259765625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, "epoch": 0.05291456857557395, - "grad_norm": 0.13794507086277008, - "kl": 0.03118896484375, - "learning_rate": 5.273972602739725e-07, - "loss": 0.1815, - "num_tokens": 111451590.0, - "reward": 0.97216796875, - "reward_std": 0.33759379386901855, - "rewards/accuracy_reward/mean": 0.13306452333927155, - "rewards/accuracy_reward/std": 0.3399873375892639, + "grad_norm": 0.1265733540058136, + "kl": 0.0062408447265625, + "learning_rate": 5.255972696245734e-07, + "loss": 0.1762, + "num_tokens": 140102752.0, + "reward": 0.7265625, + "reward_std": 0.34501129388809204, + "rewards/accuracy_reward/mean": 0.11290322244167328, + "rewards/accuracy_reward/std": 0.3167939782142639, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.84326171875, - "rewards/tag_count_reward/std": 0.27084073424339294, + "rewards/tag_count_reward/mean": 0.6171875, + "rewards/tag_count_reward/std": 0.2827172875404358, "step": 155 }, { @@ -4510,27 +4510,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.17578125, + "completions/clipped_ratio": 0.326171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 1096.755859375, - "completions/mean_terminated_length": 893.8839111328125, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1309.73046875, + "completions/mean_terminated_length": 952.365234375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, "epoch": 0.05325595288896475, - "grad_norm": 0.1420031189918518, - "kl": 0.032135009765625, - "learning_rate": 5.308219178082192e-07, - "loss": 0.221, - "num_tokens": 112086137.0, - "reward": 0.98486328125, - "reward_std": 0.36912697553634644, - "rewards/accuracy_reward/mean": 0.1484375, - "rewards/accuracy_reward/std": 0.35588082671165466, + "grad_norm": 0.1262756884098053, + "kl": 0.00704193115234375, + "learning_rate": 5.290102389078498e-07, + "loss": 0.1845, + "num_tokens": 140846342.0, + "reward": 0.6826171875, + "reward_std": 0.3658175468444824, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.83642578125, - "rewards/tag_count_reward/std": 0.28148388862609863, + "rewards/tag_count_reward/mean": 0.5751953125, + "rewards/tag_count_reward/std": 0.28747493028640747, "step": 156 }, { @@ -4539,27 +4539,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.23046875, + "completions/clipped_ratio": 0.322265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 1106.5625, - "completions/mean_terminated_length": 824.609130859375, - "completions/min_length": 102.0, - "completions/min_terminated_length": 102.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1266.55078125, + "completions/mean_terminated_length": 894.96826171875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, "epoch": 0.05359733720235555, - "grad_norm": 0.16936340928077698, - "kl": 0.032470703125, - "learning_rate": 5.342465753424658e-07, - "loss": 0.2465, - "num_tokens": 112722537.0, - "reward": 0.96142578125, - "reward_std": 0.337054967880249, - "rewards/accuracy_reward/mean": 0.15625, - "rewards/accuracy_reward/std": 0.36344730854034424, + "grad_norm": 0.14502325654029846, + "kl": 0.0071258544921875, + "learning_rate": 5.324232081911263e-07, + "loss": 0.1829, + "num_tokens": 141564656.0, + "reward": 0.7021484375, + "reward_std": 0.33567070960998535, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.80517578125, - "rewards/tag_count_reward/std": 0.29840993881225586, + "rewards/tag_count_reward/mean": 0.5908203125, + "rewards/tag_count_reward/std": 0.27856162190437317, "step": 157 }, { @@ -4568,27 +4568,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.19921875, + "completions/clipped_ratio": 0.302734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 1127.84765625, - "completions/mean_terminated_length": 898.9317016601562, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1280.9453125, + "completions/mean_terminated_length": 947.910400390625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.05393872151574635, - "grad_norm": 0.13756844401359558, - "kl": 0.031005859375, - "learning_rate": 5.376712328767123e-07, - "loss": 0.2062, - "num_tokens": 113380459.0, - "reward": 0.89306640625, - "reward_std": 0.32475483417510986, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, + "grad_norm": 0.12533237040042877, + "kl": 0.00719451904296875, + "learning_rate": 5.358361774744027e-07, + "loss": 0.1363, + "num_tokens": 142300964.0, + "reward": 0.689453125, + "reward_std": 0.3115350604057312, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.80908203125, - "rewards/tag_count_reward/std": 0.2943541407585144, + "rewards/tag_count_reward/mean": 0.61328125, + "rewards/tag_count_reward/std": 0.28301453590393066, "step": 158 }, { @@ -4597,27 +4597,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1640625, + "completions/clipped_ratio": 0.345703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 1088.658203125, - "completions/mean_terminated_length": 900.3761596679688, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1340.380859375, + "completions/mean_terminated_length": 966.5044555664062, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.05428010582913715, - "grad_norm": 0.1365545094013214, - "kl": 0.029693603515625, - "learning_rate": 5.410958904109589e-07, - "loss": 0.2414, - "num_tokens": 114018076.0, - "reward": 0.94287109375, - "reward_std": 0.3527100682258606, - "rewards/accuracy_reward/mean": 0.10685484111309052, - "rewards/accuracy_reward/std": 0.30924052000045776, + "grad_norm": 0.11826056987047195, + "kl": 0.00634002685546875, + "learning_rate": 5.392491467576792e-07, + "loss": 0.1826, + "num_tokens": 143067463.0, + "reward": 0.6484375, + "reward_std": 0.33227866888046265, + "rewards/accuracy_reward/mean": 0.06854838877916336, + "rewards/accuracy_reward/std": 0.25293970108032227, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.83935546875, - "rewards/tag_count_reward/std": 0.2730543613433838, + "rewards/tag_count_reward/mean": 0.58203125, + "rewards/tag_count_reward/std": 0.28171506524086, "step": 159 }, { @@ -4626,27 +4626,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.19921875, + "completions/clipped_ratio": 0.32421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 1097.521484375, - "completions/mean_terminated_length": 861.0609741210938, - "completions/min_length": 90.0, - "completions/min_terminated_length": 90.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1257.80859375, + "completions/mean_terminated_length": 878.6994018554688, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, "epoch": 0.05462149014252795, - "grad_norm": 0.14326536655426025, - "kl": 0.027130126953125, - "learning_rate": 5.445205479452054e-07, - "loss": 0.2094, - "num_tokens": 114656919.0, - "reward": 1.00439453125, - "reward_std": 0.3649257719516754, - "rewards/accuracy_reward/mean": 0.17578125, - "rewards/accuracy_reward/std": 0.3810062110424042, + "grad_norm": 0.14891350269317627, + "kl": 0.00669097900390625, + "learning_rate": 5.426621160409555e-07, + "loss": 0.2124, + "num_tokens": 143788373.0, + "reward": 0.7451171875, + "reward_std": 0.3568500876426697, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.82861328125, - "rewards/tag_count_reward/std": 0.2901626229286194, + "rewards/tag_count_reward/mean": 0.5947265625, + "rewards/tag_count_reward/std": 0.2906089127063751, "step": 160 }, { @@ -4655,27 +4655,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.208984375, + "completions/clipped_ratio": 0.267578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1130.7265625, - "completions/mean_terminated_length": 888.38525390625, - "completions/min_length": 79.0, - "completions/min_terminated_length": 79.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 1199.90234375, + "completions/mean_terminated_length": 890.06396484375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, "epoch": 0.05496287445591875, - "grad_norm": 0.13187538087368011, - "kl": 0.03009033203125, - "learning_rate": 5.47945205479452e-07, - "loss": 0.1951, - "num_tokens": 115317627.0, - "reward": 0.9404296875, - "reward_std": 0.3498813509941101, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, + "grad_norm": 0.12351232022047043, + "kl": 0.007415771484375, + "learning_rate": 5.46075085324232e-07, + "loss": 0.1958, + "num_tokens": 144484499.0, + "reward": 0.7333984375, + "reward_std": 0.3267363905906677, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8134765625, - "rewards/tag_count_reward/std": 0.29613298177719116, + "rewards/tag_count_reward/mean": 0.6376953125, + "rewards/tag_count_reward/std": 0.2919080853462219, "step": 161 }, { @@ -4684,27 +4684,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.126953125, + "completions/clipped_ratio": 0.26171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 969.990234375, - "completions/mean_terminated_length": 813.232666015625, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1179.337890625, + "completions/mean_terminated_length": 871.3994140625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.05530425876930955, - "grad_norm": 0.14999741315841675, - "kl": 0.030303955078125, - "learning_rate": 5.513698630136986e-07, - "loss": 0.1619, - "num_tokens": 115890934.0, - "reward": 0.99169921875, - "reward_std": 0.30944162607192993, - "rewards/accuracy_reward/mean": 0.134765625, - "rewards/accuracy_reward/std": 0.3418070077896118, + "grad_norm": 0.1453288346529007, + "kl": 0.0077362060546875, + "learning_rate": 5.494880546075085e-07, + "loss": 0.2039, + "num_tokens": 145164992.0, + "reward": 0.78076171875, + "reward_std": 0.34023576974868774, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.85693359375, - "rewards/tag_count_reward/std": 0.25873351097106934, + "rewards/tag_count_reward/mean": 0.64013671875, + "rewards/tag_count_reward/std": 0.2815517783164978, "step": 162 }, { @@ -4713,27 +4713,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.203125, + "completions/clipped_ratio": 0.294921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1132.5078125, - "completions/mean_terminated_length": 899.1470947265625, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1237.84375, + "completions/mean_terminated_length": 898.969482421875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, "epoch": 0.05564564308270035, - "grad_norm": 0.1272873431444168, - "kl": 0.0289306640625, - "learning_rate": 5.547945205479452e-07, - "loss": 0.1552, - "num_tokens": 116547306.0, - "reward": 0.95068359375, - "reward_std": 0.338924765586853, - "rewards/accuracy_reward/mean": 0.1484375, - "rewards/accuracy_reward/std": 0.35588082671165466, + "grad_norm": 0.1276940554380417, + "kl": 0.008148193359375, + "learning_rate": 5.52901023890785e-07, + "loss": 0.1519, + "num_tokens": 145875296.0, + "reward": 0.71484375, + "reward_std": 0.33005768060684204, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.80224609375, - "rewards/tag_count_reward/std": 0.2944028377532959, + "rewards/tag_count_reward/mean": 0.6171875, + "rewards/tag_count_reward/std": 0.2814164161682129, "step": 163 }, { @@ -4742,27 +4742,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.17578125, + "completions/clipped_ratio": 0.26953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 1105.451171875, - "completions/mean_terminated_length": 904.4336547851562, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1255.318359375, + "completions/mean_terminated_length": 962.83154296875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, "epoch": 0.05598702739609115, - "grad_norm": 0.1345742791891098, - "kl": 0.028045654296875, - "learning_rate": 5.582191780821918e-07, - "loss": 0.1839, - "num_tokens": 117191233.0, - "reward": 0.99365234375, - "reward_std": 0.3265041708946228, - "rewards/accuracy_reward/mean": 0.1640625, - "rewards/accuracy_reward/std": 0.37069445848464966, + "grad_norm": 0.1260259747505188, + "kl": 0.00727081298828125, + "learning_rate": 5.563139931740614e-07, + "loss": 0.1846, + "num_tokens": 146595955.0, + "reward": 0.79052734375, + "reward_std": 0.3370605707168579, + "rewards/accuracy_reward/mean": 0.162109375, + "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.82958984375, - "rewards/tag_count_reward/std": 0.2782711982727051, + "rewards/tag_count_reward/mean": 0.62841796875, + "rewards/tag_count_reward/std": 0.2862437069416046, "step": 164 }, { @@ -4771,27 +4771,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.224609375, + "completions/clipped_ratio": 0.3203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1171.6640625, - "completions/mean_terminated_length": 917.8135375976562, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1338.419921875, + "completions/mean_terminated_length": 1004.0201416015625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, "epoch": 0.05632841170948195, - "grad_norm": 0.13051313161849976, - "kl": 0.026824951171875, - "learning_rate": 5.616438356164383e-07, - "loss": 0.2358, - "num_tokens": 117867045.0, - "reward": 0.91064453125, - "reward_std": 0.34272146224975586, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, + "grad_norm": 0.11814326047897339, + "kl": 0.007232666015625, + "learning_rate": 5.597269624573379e-07, + "loss": 0.1865, + "num_tokens": 147357146.0, + "reward": 0.71484375, + "reward_std": 0.3613835871219635, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.80322265625, - "rewards/tag_count_reward/std": 0.2983555197715759, + "rewards/tag_count_reward/mean": 0.61328125, + "rewards/tag_count_reward/std": 0.2864510118961334, "step": 165 }, { @@ -4800,27 +4800,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.19140625, + "completions/clipped_ratio": 0.224609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 1097.896484375, - "completions/mean_terminated_length": 872.9927368164062, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1120.482421875, + "completions/mean_terminated_length": 851.8060302734375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, "epoch": 0.05666979602287275, - "grad_norm": 0.12242324650287628, - "kl": 0.027252197265625, - "learning_rate": 5.65068493150685e-07, - "loss": 0.1292, - "num_tokens": 118506240.0, - "reward": 0.9541015625, - "reward_std": 0.33134326338768005, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, + "grad_norm": 0.13714024424552917, + "kl": 0.009918212890625, + "learning_rate": 5.631399317406143e-07, + "loss": 0.1799, + "num_tokens": 148007905.0, + "reward": 0.78271484375, + "reward_std": 0.3490524888038635, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8271484375, - "rewards/tag_count_reward/std": 0.2878071367740631, + "rewards/tag_count_reward/mean": 0.66357421875, + "rewards/tag_count_reward/std": 0.2753334939479828, "step": 166 }, { @@ -4829,27 +4829,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.150390625, + "completions/clipped_ratio": 0.2265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 1049.802734375, - "completions/mean_terminated_length": 873.1103515625, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1162.4609375, + "completions/mean_terminated_length": 903.0606079101562, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, "epoch": 0.057011180336263546, - "grad_norm": 0.13246352970600128, - "kl": 0.030670166015625, - "learning_rate": 5.684931506849316e-07, - "loss": 0.2013, - "num_tokens": 119114347.0, - "reward": 1.01171875, - "reward_std": 0.3280482292175293, - "rewards/accuracy_reward/mean": 0.15234375, - "rewards/accuracy_reward/std": 0.35970520973205566, + "grad_norm": 0.13401290774345398, + "kl": 0.0093841552734375, + "learning_rate": 5.665529010238907e-07, + "loss": 0.2036, + "num_tokens": 148673693.0, + "reward": 0.79443359375, + "reward_std": 0.3502683639526367, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.859375, - "rewards/tag_count_reward/std": 0.26588478684425354, + "rewards/tag_count_reward/mean": 0.67529296875, + "rewards/tag_count_reward/std": 0.28738975524902344, "step": 167 }, { @@ -4858,27 +4858,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.248046875, + "completions/clipped_ratio": 0.349609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 1184.060546875, - "completions/mean_terminated_length": 899.0726928710938, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1301.564453125, + "completions/mean_terminated_length": 900.3273315429688, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, "epoch": 0.057352564649654346, - "grad_norm": 0.14424820244312286, - "kl": 0.0301513671875, - "learning_rate": 5.71917808219178e-07, - "loss": 0.2438, - "num_tokens": 119807194.0, - "reward": 0.85400390625, - "reward_std": 0.33242154121398926, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, + "grad_norm": 0.13131964206695557, + "kl": 0.008575439453125, + "learning_rate": 5.699658703071673e-07, + "loss": 0.2153, + "num_tokens": 149426702.0, + "reward": 0.66162109375, + "reward_std": 0.2989290952682495, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.78759765625, - "rewards/tag_count_reward/std": 0.30875593423843384, + "rewards/tag_count_reward/mean": 0.60888671875, + "rewards/tag_count_reward/std": 0.2934112250804901, "step": 168 }, { @@ -4887,27 +4887,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.18359375, + "completions/clipped_ratio": 0.212890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1022.5390625, - "completions/mean_terminated_length": 791.9329833984375, - "completions/min_length": 180.0, - "completions/min_terminated_length": 180.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1105.23828125, + "completions/mean_terminated_length": 850.2481079101562, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, "epoch": 0.057693948963045145, - "grad_norm": 0.1473248302936554, - "kl": 0.031097412109375, - "learning_rate": 5.753424657534246e-07, - "loss": 0.2123, - "num_tokens": 120404094.0, - "reward": 1.0205078125, - "reward_std": 0.38394251465797424, - "rewards/accuracy_reward/mean": 0.193359375, - "rewards/accuracy_reward/std": 0.39531853795051575, + "grad_norm": 0.14257201552391052, + "kl": 0.010467529296875, + "learning_rate": 5.733788395904437e-07, + "loss": 0.174, + "num_tokens": 150065944.0, + "reward": 0.8291015625, + "reward_std": 0.3473888635635376, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8271484375, - "rewards/tag_count_reward/std": 0.28695595264434814, + "rewards/tag_count_reward/mean": 0.6826171875, + "rewards/tag_count_reward/std": 0.28039422631263733, "step": 169 }, { @@ -4916,27 +4916,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.20703125, + "completions/clipped_ratio": 0.28515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 1166.875, - "completions/mean_terminated_length": 936.8275756835938, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1236.380859375, + "completions/mean_terminated_length": 912.6201782226562, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, "epoch": 0.058035333276435945, - "grad_norm": 0.13750946521759033, - "kl": 0.0291748046875, - "learning_rate": 5.787671232876712e-07, - "loss": 0.2577, - "num_tokens": 121080414.0, - "reward": 0.8798828125, - "reward_std": 0.33640050888061523, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, + "grad_norm": 0.13205033540725708, + "kl": 0.008819580078125, + "learning_rate": 5.767918088737202e-07, + "loss": 0.2266, + "num_tokens": 150777851.0, + "reward": 0.7001953125, + "reward_std": 0.3145097494125366, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8212890625, - "rewards/tag_count_reward/std": 0.2926796078681946, + "rewards/tag_count_reward/mean": 0.6513671875, + "rewards/tag_count_reward/std": 0.2980498969554901, "step": 170 }, { @@ -4945,27 +4945,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1875, + "completions/clipped_ratio": 0.255859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1107.908203125, - "completions/mean_terminated_length": 890.9639892578125, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1175.07421875, + "completions/mean_terminated_length": 874.9343872070312, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, "epoch": 0.058376717589826745, - "grad_norm": 0.11815839260816574, - "kl": 0.029205322265625, - "learning_rate": 5.821917808219177e-07, - "loss": 0.1419, - "num_tokens": 121725167.0, - "reward": 0.94775390625, - "reward_std": 0.3215233087539673, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, + "grad_norm": 0.13501673936843872, + "kl": 0.00872802734375, + "learning_rate": 5.802047781569965e-07, + "loss": 0.1589, + "num_tokens": 151456993.0, + "reward": 0.77197265625, + "reward_std": 0.34662991762161255, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.83642578125, - "rewards/tag_count_reward/std": 0.2853677570819855, + "rewards/tag_count_reward/mean": 0.65283203125, + "rewards/tag_count_reward/std": 0.29043254256248474, "step": 171 }, { @@ -4974,27 +4974,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.185546875, + "completions/clipped_ratio": 0.240234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 1102.84375, - "completions/mean_terminated_length": 887.5203857421875, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1159.556640625, + "completions/mean_terminated_length": 878.6349487304688, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.058718101903217544, - "grad_norm": 0.14123791456222534, - "kl": 0.0301513671875, - "learning_rate": 5.856164383561644e-07, - "loss": 0.1557, - "num_tokens": 122375455.0, - "reward": 0.93408203125, - "reward_std": 0.3450877368450165, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, + "grad_norm": 0.14753416180610657, + "kl": 0.0096893310546875, + "learning_rate": 5.83617747440273e-07, + "loss": 0.1648, + "num_tokens": 152136318.0, + "reward": 0.77978515625, + "reward_std": 0.35575854778289795, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.82861328125, - "rewards/tag_count_reward/std": 0.27722880244255066, + "rewards/tag_count_reward/mean": 0.66845703125, + "rewards/tag_count_reward/std": 0.29145723581314087, "step": 172 }, { @@ -5003,27 +5003,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.2109375, + "completions/clipped_ratio": 0.27734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1187.037109375, - "completions/mean_terminated_length": 956.8787231445312, - "completions/min_length": 83.0, - "completions/min_terminated_length": 83.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1245.98828125, + "completions/mean_terminated_length": 938.189208984375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, "epoch": 0.059059486216608344, - "grad_norm": 0.11834447830915451, - "kl": 0.028106689453125, - "learning_rate": 5.89041095890411e-07, - "loss": 0.1168, - "num_tokens": 123058930.0, - "reward": 0.9228515625, - "reward_std": 0.3344650864601135, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, + "grad_norm": 0.1311158388853073, + "kl": 0.0095977783203125, + "learning_rate": 5.870307167235494e-07, + "loss": 0.1818, + "num_tokens": 152849976.0, + "reward": 0.73486328125, + "reward_std": 0.34034109115600586, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8232421875, - "rewards/tag_count_reward/std": 0.28755471110343933, + "rewards/tag_count_reward/mean": 0.66455078125, + "rewards/tag_count_reward/std": 0.29159489274024963, "step": 173 }, { @@ -5032,27 +5032,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.2421875, + "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 1200.091796875, - "completions/mean_terminated_length": 929.1107788085938, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1167.24609375, + "completions/mean_terminated_length": 920.635009765625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, "epoch": 0.05940087052999914, - "grad_norm": 0.13264034688472748, - "kl": 0.02886962890625, - "learning_rate": 5.924657534246575e-07, - "loss": 0.2176, - "num_tokens": 123753393.0, - "reward": 0.99560546875, - "reward_std": 0.39991408586502075, - "rewards/accuracy_reward/mean": 0.19959677755832672, - "rewards/accuracy_reward/std": 0.40010079741477966, + "grad_norm": 0.13743795454502106, + "kl": 0.010498046875, + "learning_rate": 5.904436860068259e-07, + "loss": 0.1982, + "num_tokens": 153527622.0, + "reward": 0.8349609375, + "reward_std": 0.37249088287353516, + "rewards/accuracy_reward/mean": 0.15120968222618103, + "rewards/accuracy_reward/std": 0.35861483216285706, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.80224609375, - "rewards/tag_count_reward/std": 0.29357075691223145, + "rewards/tag_count_reward/mean": 0.6884765625, + "rewards/tag_count_reward/std": 0.28260740637779236, "step": 174 }, { @@ -5061,27 +5061,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.12109375, + "completions/clipped_ratio": 0.126953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1051.470703125, - "completions/mean_terminated_length": 914.171142578125, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 993.3046875, + "completions/mean_terminated_length": 839.9373779296875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, "epoch": 0.05974225484338994, - "grad_norm": 0.13750015199184418, - "kl": 0.029754638671875, - "learning_rate": 5.958904109589041e-07, - "loss": 0.2035, - "num_tokens": 124369490.0, - "reward": 1.00341796875, - "reward_std": 0.35021504759788513, - "rewards/accuracy_reward/mean": 0.142578125, - "rewards/accuracy_reward/std": 0.3499840497970581, + "grad_norm": 0.1527203768491745, + "kl": 0.0121307373046875, + "learning_rate": 5.938566552901024e-07, + "loss": 0.1838, + "num_tokens": 154113938.0, + "reward": 0.83935546875, + "reward_std": 0.3321494460105896, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.86083984375, - "rewards/tag_count_reward/std": 0.2556498050689697, + "rewards/tag_count_reward/mean": 0.73974609375, + "rewards/tag_count_reward/std": 0.2668350040912628, "step": 175 }, { @@ -5090,27 +5090,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1328125, + "completions/clipped_ratio": 0.142578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 994.48828125, - "completions/mean_terminated_length": 833.1396484375, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1011.345703125, + "completions/mean_terminated_length": 838.9635620117188, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, "epoch": 0.06008363915678075, - "grad_norm": 0.14909285306930542, - "kl": 0.029937744140625, - "learning_rate": 5.993150684931506e-07, - "loss": 0.1983, - "num_tokens": 124958524.0, - "reward": 0.98681640625, - "reward_std": 0.3174964189529419, - "rewards/accuracy_reward/mean": 0.12298387289047241, - "rewards/accuracy_reward/std": 0.32875028252601624, + "grad_norm": 0.16555017232894897, + "kl": 0.01165771484375, + "learning_rate": 5.972696245733788e-07, + "loss": 0.1714, + "num_tokens": 154711603.0, + "reward": 0.828125, + "reward_std": 0.3264949917793274, + "rewards/accuracy_reward/mean": 0.0947580635547638, + "rewards/accuracy_reward/std": 0.29317617416381836, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.86767578125, - "rewards/tag_count_reward/std": 0.2515996992588043, + "rewards/tag_count_reward/mean": 0.736328125, + "rewards/tag_count_reward/std": 0.2759232819080353, "step": 176 }, { @@ -5119,27 +5119,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1953125, + "completions/clipped_ratio": 0.216796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 1125.064453125, - "completions/mean_terminated_length": 901.0509643554688, - "completions/min_length": 72.0, - "completions/min_terminated_length": 72.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1123.158203125, + "completions/mean_terminated_length": 867.1546630859375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, "epoch": 0.06042502347017155, - "grad_norm": 0.1393154114484787, - "kl": 0.03125, - "learning_rate": 6.027397260273972e-07, - "loss": 0.2254, - "num_tokens": 125612029.0, - "reward": 0.96240234375, - "reward_std": 0.35647714138031006, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, + "grad_norm": 0.15225385129451752, + "kl": 0.012969970703125, + "learning_rate": 6.006825938566553e-07, + "loss": 0.2312, + "num_tokens": 155364132.0, + "reward": 0.85302734375, + "reward_std": 0.3661240041255951, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.83544921875, - "rewards/tag_count_reward/std": 0.28178247809410095, + "rewards/tag_count_reward/mean": 0.70654296875, + "rewards/tag_count_reward/std": 0.28829270601272583, "step": 177 }, { @@ -5148,27 +5148,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.205078125, + "completions/clipped_ratio": 0.228515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 1169.005859375, - "completions/mean_terminated_length": 942.23828125, - "completions/min_length": 120.0, - "completions/min_terminated_length": 120.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1152.826171875, + "completions/mean_terminated_length": 887.6734008789062, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, "epoch": 0.06076640778356235, - "grad_norm": 0.12574030458927155, - "kl": 0.031005859375, - "learning_rate": 6.061643835616438e-07, - "loss": 0.1577, - "num_tokens": 126289696.0, - "reward": 0.9912109375, - "reward_std": 0.3792455494403839, - "rewards/accuracy_reward/mean": 0.16796875, - "rewards/accuracy_reward/std": 0.374204158782959, + "grad_norm": 0.14308057725429535, + "kl": 0.0115203857421875, + "learning_rate": 6.040955631399317e-07, + "loss": 0.2071, + "num_tokens": 156033515.0, + "reward": 0.83935546875, + "reward_std": 0.3647027909755707, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8232421875, - "rewards/tag_count_reward/std": 0.2854200601577759, + "rewards/tag_count_reward/mean": 0.70849609375, + "rewards/tag_count_reward/std": 0.29362282156944275, "step": 178 }, { @@ -5177,27 +5177,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.216796875, + "completions/clipped_ratio": 0.271484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1948.0, - "completions/mean_length": 1147.837890625, - "completions/mean_terminated_length": 898.6658935546875, - "completions/min_length": 134.0, - "completions/min_terminated_length": 134.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1181.66015625, + "completions/mean_terminated_length": 858.8150634765625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, "epoch": 0.06110779209695315, - "grad_norm": 0.11976183205842972, - "kl": 0.0303955078125, - "learning_rate": 6.095890410958904e-07, - "loss": 0.1866, - "num_tokens": 126951517.0, - "reward": 0.89794921875, - "reward_std": 0.30390793085098267, - "rewards/accuracy_reward/mean": 0.0947580635547638, - "rewards/accuracy_reward/std": 0.29317617416381836, + "grad_norm": 0.16277040541172028, + "kl": 0.0136260986328125, + "learning_rate": 6.075085324232082e-07, + "loss": 0.1894, + "num_tokens": 156712653.0, + "reward": 0.7724609375, + "reward_std": 0.3247779905796051, + "rewards/accuracy_reward/mean": 0.09879032522439957, + "rewards/accuracy_reward/std": 0.2986815273761749, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.80615234375, - "rewards/tag_count_reward/std": 0.3010845482349396, + "rewards/tag_count_reward/mean": 0.6767578125, + "rewards/tag_count_reward/std": 0.30848538875579834, "step": 179 }, { @@ -5206,27 +5206,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.166015625, + "completions/clipped_ratio": 0.24609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 1123.68359375, - "completions/mean_terminated_length": 939.6861572265625, - "completions/min_length": 125.0, - "completions/min_terminated_length": 125.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1166.521484375, + "completions/mean_terminated_length": 878.7849731445312, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, "epoch": 0.06144917641034395, - "grad_norm": 0.13634192943572998, - "kl": 0.0311279296875, - "learning_rate": 6.13013698630137e-07, - "loss": 0.1713, - "num_tokens": 127615931.0, - "reward": 0.908203125, - "reward_std": 0.32293522357940674, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, + "grad_norm": 0.14448601007461548, + "kl": 0.012359619140625, + "learning_rate": 6.109215017064846e-07, + "loss": 0.1842, + "num_tokens": 157399000.0, + "reward": 0.7685546875, + "reward_std": 0.32453253865242004, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.837890625, - "rewards/tag_count_reward/std": 0.27642151713371277, + "rewards/tag_count_reward/mean": 0.6923828125, + "rewards/tag_count_reward/std": 0.2915150225162506, "step": 180 }, { @@ -5235,27 +5235,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.169921875, + "completions/clipped_ratio": 0.19921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 1028.015625, - "completions/mean_terminated_length": 819.2188110351562, - "completions/min_length": 65.0, - "completions/min_terminated_length": 65.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 992.98046875, + "completions/mean_terminated_length": 730.51220703125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, "epoch": 0.06179056072373475, - "grad_norm": 0.16368067264556885, - "kl": 0.03240966796875, - "learning_rate": 6.164383561643835e-07, - "loss": 0.2359, - "num_tokens": 128223923.0, - "reward": 0.98193359375, - "reward_std": 0.3258340358734131, - "rewards/accuracy_reward/mean": 0.140625, - "rewards/accuracy_reward/std": 0.3479743003845215, + "grad_norm": 0.1693628877401352, + "kl": 0.01519775390625, + "learning_rate": 6.143344709897611e-07, + "loss": 0.225, + "num_tokens": 157989054.0, + "reward": 0.8662109375, + "reward_std": 0.30623167753219604, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.84130859375, - "rewards/tag_count_reward/std": 0.27150672674179077, + "rewards/tag_count_reward/mean": 0.7392578125, + "rewards/tag_count_reward/std": 0.2804487645626068, "step": 181 }, { @@ -5264,27 +5264,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.234375, + "completions/clipped_ratio": 0.2421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 1265.26953125, - "completions/mean_terminated_length": 1025.658203125, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1212.3984375, + "completions/mean_terminated_length": 945.3504638671875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, "epoch": 0.06213194503712555, - "grad_norm": 0.1233213022351265, - "kl": 0.024993896484375, - "learning_rate": 6.198630136986301e-07, - "loss": 0.1866, - "num_tokens": 128949901.0, - "reward": 0.8642578125, - "reward_std": 0.33978962898254395, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, + "grad_norm": 0.14012351632118225, + "kl": 0.0143890380859375, + "learning_rate": 6.177474402730375e-07, + "loss": 0.185, + "num_tokens": 158687962.0, + "reward": 0.7490234375, + "reward_std": 0.30691662430763245, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7939453125, - "rewards/tag_count_reward/std": 0.3033125400543213, + "rewards/tag_count_reward/mean": 0.7060546875, + "rewards/tag_count_reward/std": 0.29720231890678406, "step": 182 }, { @@ -5293,27 +5293,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.177734375, + "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1060.158203125, - "completions/mean_terminated_length": 846.6342163085938, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 1030.939453125, + "completions/mean_terminated_length": 771.6887817382812, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, "epoch": 0.062473329350516346, - "grad_norm": 0.13886182010173798, - "kl": 0.035919189453125, - "learning_rate": 6.232876712328768e-07, - "loss": 0.1899, - "num_tokens": 129569054.0, - "reward": 0.90283203125, - "reward_std": 0.3168081045150757, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, + "grad_norm": 0.20735539495944977, + "kl": 0.0152130126953125, + "learning_rate": 6.21160409556314e-07, + "loss": 0.2826, + "num_tokens": 159292155.0, + "reward": 0.80712890625, + "reward_std": 0.33187347650527954, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.83642578125, - "rewards/tag_count_reward/std": 0.2744435966014862, + "rewards/tag_count_reward/mean": 0.73681640625, + "rewards/tag_count_reward/std": 0.28660064935684204, "step": 183 }, { @@ -5322,27 +5322,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.201171875, + "completions/clipped_ratio": 0.23046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1194.966796875, - "completions/mean_terminated_length": 980.144287109375, - "completions/min_length": 198.0, - "completions/min_terminated_length": 198.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1164.501953125, + "completions/mean_terminated_length": 899.9010009765625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, "epoch": 0.06281471366390715, - "grad_norm": 0.12694279849529266, - "kl": 0.0264892578125, - "learning_rate": 6.267123287671232e-07, - "loss": 0.1723, - "num_tokens": 130256861.0, - "reward": 0.94189453125, - "reward_std": 0.3124285340309143, - "rewards/accuracy_reward/mean": 0.119140625, - "rewards/accuracy_reward/std": 0.32427072525024414, + "grad_norm": 0.15091517567634583, + "kl": 0.0132904052734375, + "learning_rate": 6.245733788395904e-07, + "loss": 0.1759, + "num_tokens": 159964364.0, + "reward": 0.822265625, + "reward_std": 0.3314219117164612, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.82275390625, - "rewards/tag_count_reward/std": 0.2900077998638153, + "rewards/tag_count_reward/mean": 0.728515625, + "rewards/tag_count_reward/std": 0.29762157797813416, "step": 184 }, { @@ -5351,27 +5351,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1953125, + "completions/clipped_ratio": 0.208984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 1163.544921875, - "completions/mean_terminated_length": 948.871337890625, - "completions/min_length": 198.0, - "completions/min_terminated_length": 198.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1129.025390625, + "completions/mean_terminated_length": 886.234619140625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, "epoch": 0.06315609797729795, - "grad_norm": 0.13889499008655548, - "kl": 0.030975341796875, - "learning_rate": 6.301369863013698e-07, - "loss": 0.1728, - "num_tokens": 130932100.0, - "reward": 0.97265625, - "reward_std": 0.3136816918849945, - "rewards/accuracy_reward/mean": 0.15927419066429138, - "rewards/accuracy_reward/std": 0.3663010001182556, + "grad_norm": 0.1488422006368637, + "kl": 0.0141754150390625, + "learning_rate": 6.279863481228669e-07, + "loss": 0.2132, + "num_tokens": 160621929.0, + "reward": 0.8271484375, + "reward_std": 0.3353453278541565, + "rewards/accuracy_reward/mean": 0.1088709682226181, + "rewards/accuracy_reward/std": 0.31179171800613403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.818359375, - "rewards/tag_count_reward/std": 0.2831968069076538, + "rewards/tag_count_reward/mean": 0.7216796875, + "rewards/tag_count_reward/std": 0.2916460931301117, "step": 185 }, { @@ -5380,27 +5380,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.21484375, + "completions/clipped_ratio": 0.240234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 1212.626953125, - "completions/mean_terminated_length": 984.042236328125, - "completions/min_length": 169.0, - "completions/min_terminated_length": 169.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1129.703125, + "completions/mean_terminated_length": 839.3419189453125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, "epoch": 0.06349748229068874, - "grad_norm": 0.13234475255012512, - "kl": 0.027008056640625, - "learning_rate": 6.335616438356164e-07, - "loss": 0.1543, - "num_tokens": 131633621.0, - "reward": 1.02880859375, - "reward_std": 0.3996187746524811, - "rewards/accuracy_reward/mean": 0.212890625, - "rewards/accuracy_reward/std": 0.409751296043396, + "grad_norm": 8.422516822814941, + "kl": 0.09814453125, + "learning_rate": 6.313993174061433e-07, + "loss": 0.1775, + "num_tokens": 161280993.0, + "reward": 0.89892578125, + "reward_std": 0.38806048035621643, + "rewards/accuracy_reward/mean": 0.173828125, + "rewards/accuracy_reward/std": 0.3793322443962097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.81591796875, - "rewards/tag_count_reward/std": 0.298685222864151, + "rewards/tag_count_reward/mean": 0.72509765625, + "rewards/tag_count_reward/std": 0.30083054304122925, "step": 186 }, { @@ -5409,27 +5409,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.146484375, + "completions/clipped_ratio": 0.166015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 1028.087890625, - "completions/mean_terminated_length": 853.0457153320312, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 990.580078125, + "completions/mean_terminated_length": 780.0866088867188, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, "epoch": 0.06383886660407954, - "grad_norm": 0.1387174427509308, - "kl": 0.030181884765625, - "learning_rate": 6.369863013698629e-07, - "loss": 0.1557, - "num_tokens": 132234946.0, - "reward": 1.0439453125, - "reward_std": 0.33869534730911255, - "rewards/accuracy_reward/mean": 0.185546875, - "rewards/accuracy_reward/std": 0.38912075757980347, + "grad_norm": 0.17361460626125336, + "kl": 0.0167999267578125, + "learning_rate": 6.348122866894197e-07, + "loss": 0.1832, + "num_tokens": 161863114.0, + "reward": 0.947265625, + "reward_std": 0.34387677907943726, + "rewards/accuracy_reward/mean": 0.169921875, + "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8583984375, - "rewards/tag_count_reward/std": 0.25977516174316406, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.2766771912574768, "step": 187 }, { @@ -5438,27 +5438,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.16015625, + "completions/clipped_ratio": 0.212890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1202.458984375, - "completions/mean_terminated_length": 1041.21630859375, - "completions/min_length": 197.0, - "completions/min_terminated_length": 197.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 1148.482421875, + "completions/mean_terminated_length": 905.1885986328125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.06418025091747034, - "grad_norm": 0.1355055719614029, - "kl": 0.03045654296875, - "learning_rate": 6.404109589041096e-07, - "loss": 0.1949, - "num_tokens": 132928477.0, - "reward": 0.92333984375, - "reward_std": 0.3060658872127533, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, + "grad_norm": 0.13827481865882874, + "kl": 0.0141448974609375, + "learning_rate": 6.382252559726961e-07, + "loss": 0.1854, + "num_tokens": 162529009.0, + "reward": 0.8095703125, + "reward_std": 0.2875130772590637, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.84326171875, - "rewards/tag_count_reward/std": 0.272641122341156, + "rewards/tag_count_reward/mean": 0.7451171875, + "rewards/tag_count_reward/std": 0.2891981899738312, "step": 188 }, { @@ -5467,27 +5467,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.171875, + "completions/clipped_ratio": 0.20703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1094.248046875, - "completions/mean_terminated_length": 896.299560546875, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1068.818359375, + "completions/mean_terminated_length": 813.169921875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, "epoch": 0.06452163523086114, - "grad_norm": 0.14412641525268555, - "kl": 0.029541015625, - "learning_rate": 6.438356164383562e-07, - "loss": 0.2264, - "num_tokens": 133565260.0, - "reward": 0.9443359375, - "reward_std": 0.319268137216568, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, + "grad_norm": 0.15764501690864563, + "kl": 0.0146942138671875, + "learning_rate": 6.416382252559727e-07, + "loss": 0.2003, + "num_tokens": 163152772.0, + "reward": 0.830078125, + "reward_std": 0.3172072172164917, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8310546875, - "rewards/tag_count_reward/std": 0.28587502241134644, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.29382818937301636, "step": 189 }, { @@ -5496,27 +5496,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.203125, + "completions/clipped_ratio": 0.185546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 1186.87890625, - "completions/mean_terminated_length": 967.3775024414062, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1099.25390625, + "completions/mean_terminated_length": 883.1127319335938, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, "epoch": 0.06486301954425194, - "grad_norm": 0.12044928967952728, - "kl": 0.0277099609375, - "learning_rate": 6.472602739726027e-07, - "loss": 0.1837, - "num_tokens": 134245438.0, - "reward": 0.9306640625, - "reward_std": 0.3642945885658264, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, + "grad_norm": 0.13573741912841797, + "kl": 0.0135650634765625, + "learning_rate": 6.450511945392492e-07, + "loss": 0.1951, + "num_tokens": 163788086.0, + "reward": 0.85888671875, + "reward_std": 0.35918933153152466, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8310546875, - "rewards/tag_count_reward/std": 0.2854468524456024, + "rewards/tag_count_reward/mean": 0.75732421875, + "rewards/tag_count_reward/std": 0.2855284512042999, "step": 190 }, { @@ -5525,27 +5525,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.166015625, + "completions/clipped_ratio": 0.228515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1131.662109375, - "completions/mean_terminated_length": 949.2528686523438, - "completions/min_length": 79.0, - "completions/min_terminated_length": 79.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1127.689453125, + "completions/mean_terminated_length": 855.0911254882812, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, "epoch": 0.06520440385764274, - "grad_norm": 0.13368235528469086, - "kl": 0.028045654296875, - "learning_rate": 6.506849315068493e-07, - "loss": 0.1476, - "num_tokens": 134906465.0, - "reward": 1.0322265625, - "reward_std": 0.31763800978660583, - "rewards/accuracy_reward/mean": 0.162109375, - "rewards/accuracy_reward/std": 0.3689115643501282, + "grad_norm": 0.13883548974990845, + "kl": 0.0126495361328125, + "learning_rate": 6.484641638225256e-07, + "loss": 0.2165, + "num_tokens": 164447079.0, + "reward": 0.9189453125, + "reward_std": 0.3817644417285919, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8701171875, - "rewards/tag_count_reward/std": 0.25789982080459595, + "rewards/tag_count_reward/mean": 0.7509765625, + "rewards/tag_count_reward/std": 0.30003103613853455, "step": 191 }, { @@ -5554,27 +5554,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.173828125, + "completions/clipped_ratio": 0.216796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 1241.54296875, - "completions/mean_terminated_length": 1071.8629150390625, - "completions/min_length": 204.0, - "completions/min_terminated_length": 204.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1212.306640625, + "completions/mean_terminated_length": 980.9801025390625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.06554578817103354, - "grad_norm": 0.11532191932201385, - "kl": 0.0250244140625, - "learning_rate": 6.541095890410958e-07, - "loss": 0.1354, - "num_tokens": 135617511.0, - "reward": 0.947265625, - "reward_std": 0.32218992710113525, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, + "grad_norm": 0.12487567216157913, + "kl": 0.0131988525390625, + "learning_rate": 6.51877133105802e-07, + "loss": 0.1717, + "num_tokens": 165143156.0, + "reward": 0.79248046875, + "reward_std": 0.32984548807144165, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.849609375, - "rewards/tag_count_reward/std": 0.26833879947662354, + "rewards/tag_count_reward/mean": 0.72607421875, + "rewards/tag_count_reward/std": 0.29099130630493164, "step": 192 }, { @@ -5583,27 +5583,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.216796875, + "completions/clipped_ratio": 0.291015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1185.435546875, - "completions/mean_terminated_length": 946.6708374023438, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1198.216796875, + "completions/mean_terminated_length": 849.40771484375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, "epoch": 0.06588717248442434, - "grad_norm": 0.13891267776489258, - "kl": 0.02862548828125, - "learning_rate": 6.575342465753423e-07, - "loss": 0.1904, - "num_tokens": 136308198.0, - "reward": 0.90380859375, - "reward_std": 0.30265724658966064, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, + "grad_norm": 0.14539961516857147, + "kl": 0.01318359375, + "learning_rate": 6.552901023890784e-07, + "loss": 0.2089, + "num_tokens": 165840387.0, + "reward": 0.74609375, + "reward_std": 0.32558518648147583, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.81591796875, - "rewards/tag_count_reward/std": 0.29456183314323425, + "rewards/tag_count_reward/mean": 0.68359375, + "rewards/tag_count_reward/std": 0.30884605646133423, "step": 193 }, { @@ -5612,27 +5612,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.146484375, + "completions/clipped_ratio": 0.19140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1072.626953125, - "completions/mean_terminated_length": 905.2288208007812, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1068.576171875, + "completions/mean_terminated_length": 836.7318725585938, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, "epoch": 0.06622855679781514, - "grad_norm": 0.17406651377677917, - "kl": 0.03094482421875, - "learning_rate": 6.60958904109589e-07, - "loss": 0.1819, - "num_tokens": 136935495.0, - "reward": 0.98876953125, - "reward_std": 0.3156236410140991, - "rewards/accuracy_reward/mean": 0.13306452333927155, - "rewards/accuracy_reward/std": 0.3399873673915863, + "grad_norm": 0.1555928736925125, + "kl": 0.015472412109375, + "learning_rate": 6.587030716723549e-07, + "loss": 0.2185, + "num_tokens": 166465610.0, + "reward": 0.89208984375, + "reward_std": 0.3563552498817444, + "rewards/accuracy_reward/mean": 0.13104838132858276, + "rewards/accuracy_reward/std": 0.3377939462661743, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.85986328125, - "rewards/tag_count_reward/std": 0.2522217929363251, + "rewards/tag_count_reward/mean": 0.76513671875, + "rewards/tag_count_reward/std": 0.2953328788280487, "step": 194 }, { @@ -5641,27 +5641,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.228515625, + "completions/clipped_ratio": 0.30859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1188.35546875, - "completions/mean_terminated_length": 933.7265625, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1205.119140625, + "completions/mean_terminated_length": 828.9180908203125, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, "epoch": 0.06656994111120594, - "grad_norm": 0.13052046298980713, - "kl": 0.03033447265625, - "learning_rate": 6.643835616438356e-07, - "loss": 0.2095, - "num_tokens": 137617373.0, - "reward": 0.916015625, - "reward_std": 0.33053910732269287, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, + "grad_norm": 0.15445198118686676, + "kl": 0.0128326416015625, + "learning_rate": 6.621160409556313e-07, + "loss": 0.224, + "num_tokens": 167156071.0, + "reward": 0.79931640625, + "reward_std": 0.3117251992225647, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.814453125, - "rewards/tag_count_reward/std": 0.2983911335468292, + "rewards/tag_count_reward/mean": 0.69970703125, + "rewards/tag_count_reward/std": 0.32663094997406006, "step": 195 }, { @@ -5670,27 +5670,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.17578125, + "completions/clipped_ratio": 0.20703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 1194.693359375, - "completions/mean_terminated_length": 1012.7085571289062, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1148.3359375, + "completions/mean_terminated_length": 913.4482421875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, "epoch": 0.06691132542459674, - "grad_norm": 0.14570492506027222, - "kl": 0.029083251953125, - "learning_rate": 6.678082191780822e-07, - "loss": 0.1595, - "num_tokens": 138305104.0, - "reward": 0.9814453125, - "reward_std": 0.30250635743141174, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, + "grad_norm": 0.14241255819797516, + "kl": 0.013671875, + "learning_rate": 6.655290102389079e-07, + "loss": 0.1861, + "num_tokens": 167820067.0, + "reward": 0.85986328125, + "reward_std": 0.330346018075943, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8544921875, - "rewards/tag_count_reward/std": 0.2518412172794342, + "rewards/tag_count_reward/mean": 0.75244140625, + "rewards/tag_count_reward/std": 0.29941052198410034, "step": 196 }, { @@ -5699,27 +5699,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1640625, + "completions/clipped_ratio": 0.2734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 1146.6015625, - "completions/mean_terminated_length": 969.6915283203125, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1188.43359375, + "completions/mean_terminated_length": 864.9408569335938, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, "epoch": 0.06725270973798754, - "grad_norm": 0.12312240898609161, - "kl": 0.025299072265625, - "learning_rate": 6.712328767123287e-07, - "loss": 0.166, - "num_tokens": 138967364.0, - "reward": 0.9775390625, - "reward_std": 0.31271493434906006, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, + "grad_norm": 0.13246527314186096, + "kl": 0.01220703125, + "learning_rate": 6.689419795221843e-07, + "loss": 0.2363, + "num_tokens": 168503745.0, + "reward": 0.8134765625, + "reward_std": 0.3302205204963684, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8623046875, - "rewards/tag_count_reward/std": 0.25715774297714233, + "rewards/tag_count_reward/mean": 0.7119140625, + "rewards/tag_count_reward/std": 0.31165337562561035, "step": 197 }, { @@ -5728,27 +5728,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.14453125, + "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 1078.677734375, - "completions/mean_terminated_length": 914.9109497070312, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1013.77734375, + "completions/mean_terminated_length": 775.110595703125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, "epoch": 0.06759409405137834, - "grad_norm": 0.12881354987621307, - "kl": 0.027496337890625, - "learning_rate": 6.746575342465753e-07, - "loss": 0.1802, - "num_tokens": 139590975.0, - "reward": 1.03271484375, - "reward_std": 0.3340844511985779, - "rewards/accuracy_reward/mean": 0.1484375, - "rewards/accuracy_reward/std": 0.35588082671165466, + "grad_norm": 0.13493385910987854, + "kl": 0.012359619140625, + "learning_rate": 6.723549488054607e-07, + "loss": 0.2048, + "num_tokens": 169094127.0, + "reward": 0.8671875, + "reward_std": 0.3374782204627991, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88427734375, - "rewards/tag_count_reward/std": 0.24463292956352234, + "rewards/tag_count_reward/mean": 0.771484375, + "rewards/tag_count_reward/std": 0.29306623339653015, "step": 198 }, { @@ -5757,27 +5757,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.15234375, + "completions/clipped_ratio": 0.197265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 1096.515625, - "completions/mean_terminated_length": 925.5115356445312, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1079.85546875, + "completions/mean_terminated_length": 841.9415893554688, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, "epoch": 0.06793547836476914, - "grad_norm": 0.14294716715812683, - "kl": 0.02911376953125, - "learning_rate": 6.78082191780822e-07, - "loss": 0.1705, - "num_tokens": 140229703.0, - "reward": 1.00634765625, - "reward_std": 0.29666227102279663, - "rewards/accuracy_reward/mean": 0.140625, - "rewards/accuracy_reward/std": 0.3479743003845215, + "grad_norm": 0.1415771245956421, + "kl": 0.0149688720703125, + "learning_rate": 6.757679180887371e-07, + "loss": 0.2073, + "num_tokens": 169724325.0, + "reward": 0.89697265625, + "reward_std": 0.3454717993736267, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.86572265625, - "rewards/tag_count_reward/std": 0.2539547383785248, + "rewards/tag_count_reward/mean": 0.76220703125, + "rewards/tag_count_reward/std": 0.29753127694129944, "step": 199 }, { @@ -5786,27 +5786,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.205078125, + "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1200.646484375, - "completions/mean_terminated_length": 982.041748046875, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1179.71484375, + "completions/mean_terminated_length": 890.2864990234375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, "epoch": 0.06827686267815994, - "grad_norm": 0.3884568214416504, - "kl": 0.0321044921875, - "learning_rate": 6.815068493150684e-07, - "loss": 0.1919, - "num_tokens": 140912146.0, - "reward": 0.9267578125, - "reward_std": 0.3244363069534302, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, + "grad_norm": 0.1384713351726532, + "kl": 0.0121917724609375, + "learning_rate": 6.791808873720136e-07, + "loss": 0.2496, + "num_tokens": 170396051.0, + "reward": 0.833984375, + "reward_std": 0.33412545919418335, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8271484375, - "rewards/tag_count_reward/std": 0.27830082178115845, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.3092418313026428, "step": 200 }, { @@ -5815,27 +5815,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.177734375, + "completions/clipped_ratio": 0.287109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1189.732421875, - "completions/mean_terminated_length": 1004.2161865234375, - "completions/min_length": 249.0, - "completions/min_terminated_length": 249.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1221.71875, + "completions/mean_terminated_length": 888.9425048828125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.06861824699155074, - "grad_norm": 0.11886925250291824, - "kl": 0.026519775390625, - "learning_rate": 6.84931506849315e-07, - "loss": 0.1603, - "num_tokens": 141598809.0, - "reward": 0.94677734375, - "reward_std": 0.27545297145843506, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, + "grad_norm": 0.12870904803276062, + "kl": 0.0127105712890625, + "learning_rate": 6.825938566552901e-07, + "loss": 0.2314, + "num_tokens": 171099091.0, + "reward": 0.7841796875, + "reward_std": 0.33621302247047424, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.85302734375, - "rewards/tag_count_reward/std": 0.26543471217155457, + "rewards/tag_count_reward/mean": 0.7021484375, + "rewards/tag_count_reward/std": 0.32681119441986084, "step": 201 }, { @@ -5844,27 +5844,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1875, + "completions/clipped_ratio": 0.296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 1105.384765625, - "completions/mean_terminated_length": 887.8582153320312, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1204.849609375, + "completions/mean_terminated_length": 848.852783203125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, "epoch": 0.06895963130494154, - "grad_norm": 0.1469177007675171, - "kl": 0.03057861328125, - "learning_rate": 6.883561643835616e-07, - "loss": 0.1918, - "num_tokens": 142248910.0, - "reward": 1.02099609375, - "reward_std": 0.32276904582977295, - "rewards/accuracy_reward/mean": 0.17578125, - "rewards/accuracy_reward/std": 0.3810062110424042, + "grad_norm": 0.14517195522785187, + "kl": 0.014617919921875, + "learning_rate": 6.860068259385665e-07, + "loss": 0.2354, + "num_tokens": 171800118.0, + "reward": 0.845703125, + "reward_std": 0.37770742177963257, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.84521484375, - "rewards/tag_count_reward/std": 0.2728618085384369, + "rewards/tag_count_reward/mean": 0.697265625, + "rewards/tag_count_reward/std": 0.3241764307022095, "step": 202 }, { @@ -5873,27 +5873,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.130859375, + "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 1059.806640625, - "completions/mean_terminated_length": 911.0224609375, - "completions/min_length": 88.0, - "completions/min_terminated_length": 88.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1061.5, + "completions/mean_terminated_length": 878.8148193359375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, "epoch": 0.06930101561833234, - "grad_norm": 0.16338153183460236, - "kl": 0.03179931640625, - "learning_rate": 6.917808219178081e-07, - "loss": 0.1605, - "num_tokens": 142869899.0, - "reward": 0.974609375, - "reward_std": 0.302266001701355, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, + "grad_norm": 0.14468716084957123, + "kl": 0.0157623291015625, + "learning_rate": 6.89419795221843e-07, + "loss": 0.2042, + "num_tokens": 172421974.0, + "reward": 0.8896484375, + "reward_std": 0.33229973912239075, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.873046875, - "rewards/tag_count_reward/std": 0.2541169822216034, + "rewards/tag_count_reward/mean": 0.7861328125, + "rewards/tag_count_reward/std": 0.2882450520992279, "step": 203 }, { @@ -5902,27 +5902,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.126953125, + "completions/clipped_ratio": 0.1953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 973.978515625, - "completions/mean_terminated_length": 817.8009033203125, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 980.158203125, + "completions/mean_terminated_length": 720.9733276367188, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, "epoch": 0.06964239993172314, - "grad_norm": 0.16850519180297852, - "kl": 0.035400390625, - "learning_rate": 6.952054794520548e-07, - "loss": 0.1791, - "num_tokens": 143440848.0, - "reward": 1.0654296875, - "reward_std": 0.3052448332309723, - "rewards/accuracy_reward/mean": 0.185546875, - "rewards/accuracy_reward/std": 0.38912075757980347, + "grad_norm": 0.18174254894256592, + "kl": 0.0180206298828125, + "learning_rate": 6.928327645051194e-07, + "loss": 0.254, + "num_tokens": 172996087.0, + "reward": 0.9267578125, + "reward_std": 0.3677091598510742, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8798828125, - "rewards/tag_count_reward/std": 0.25214457511901855, + "rewards/tag_count_reward/mean": 0.7724609375, + "rewards/tag_count_reward/std": 0.2942425310611725, "step": 204 }, { @@ -5931,27 +5931,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.193359375, + "completions/clipped_ratio": 0.275390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1150.9921875, - "completions/mean_terminated_length": 935.9710083007812, - "completions/min_length": 179.0, - "completions/min_terminated_length": 179.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1194.7421875, + "completions/mean_terminated_length": 870.4581909179688, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, "epoch": 0.06998378424511394, - "grad_norm": 0.11850355565547943, - "kl": 0.026214599609375, - "learning_rate": 6.986301369863014e-07, - "loss": 0.1802, - "num_tokens": 144103548.0, - "reward": 0.9375, - "reward_std": 0.3141724467277527, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, + "grad_norm": 0.14662407338619232, + "kl": 0.014312744140625, + "learning_rate": 6.962457337883959e-07, + "loss": 0.2183, + "num_tokens": 173681187.0, + "reward": 0.79736328125, + "reward_std": 0.3289518654346466, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.845703125, - "rewards/tag_count_reward/std": 0.2778007984161377, + "rewards/tag_count_reward/mean": 0.72509765625, + "rewards/tag_count_reward/std": 0.3170620799064636, "step": 205 }, { @@ -5960,27 +5960,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.16015625, + "completions/clipped_ratio": 0.212890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1131.931640625, - "completions/mean_terminated_length": 957.239501953125, - "completions/min_length": 261.0, - "completions/min_terminated_length": 261.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1109.548828125, + "completions/mean_terminated_length": 855.7245483398438, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, "epoch": 0.07032516855850474, - "grad_norm": 0.13232004642486572, - "kl": 0.029022216796875, - "learning_rate": 7.02054794520548e-07, - "loss": 0.1503, - "num_tokens": 144768345.0, - "reward": 0.9990234375, - "reward_std": 0.3439529538154602, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, + "grad_norm": 0.14206916093826294, + "kl": 0.0142822265625, + "learning_rate": 6.996587030716723e-07, + "loss": 0.1932, + "num_tokens": 174334524.0, + "reward": 0.8896484375, + "reward_std": 0.3636188507080078, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8720703125, - "rewards/tag_count_reward/std": 0.2531430721282959, + "rewards/tag_count_reward/mean": 0.7587890625, + "rewards/tag_count_reward/std": 0.29785749316215515, "step": 206 }, { @@ -5989,27 +5989,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1484375, + "completions/clipped_ratio": 0.212890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 1070.21484375, - "completions/mean_terminated_length": 899.7752075195312, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1090.5234375, + "completions/mean_terminated_length": 831.5533447265625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, "epoch": 0.07066655287189554, - "grad_norm": 0.13105420768260956, - "kl": 0.030609130859375, - "learning_rate": 7.054794520547945e-07, - "loss": 0.1305, - "num_tokens": 145391687.0, - "reward": 0.9775390625, - "reward_std": 0.2806186378002167, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, + "grad_norm": 0.14183954894542694, + "kl": 0.0157623291015625, + "learning_rate": 7.030716723549488e-07, + "loss": 0.1804, + "num_tokens": 174968264.0, + "reward": 0.86279296875, + "reward_std": 0.29027214646339417, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8740234375, - "rewards/tag_count_reward/std": 0.255562424659729, + "rewards/tag_count_reward/mean": 0.77490234375, + "rewards/tag_count_reward/std": 0.30964595079421997, "step": 207 }, { @@ -6018,27 +6018,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.146484375, + "completions/clipped_ratio": 0.1640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 1061.9375, - "completions/mean_terminated_length": 892.7047729492188, - "completions/min_length": 79.0, - "completions/min_terminated_length": 79.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1038.671875, + "completions/mean_terminated_length": 840.5794067382812, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, "epoch": 0.07100793718528634, - "grad_norm": 0.28645065426826477, - "kl": 0.03521728515625, - "learning_rate": 7.08904109589041e-07, - "loss": 0.1834, - "num_tokens": 146008183.0, - "reward": 1.04541015625, - "reward_std": 0.308370977640152, - "rewards/accuracy_reward/mean": 0.17578125, - "rewards/accuracy_reward/std": 0.3810062110424042, + "grad_norm": 0.15344756841659546, + "kl": 0.0158538818359375, + "learning_rate": 7.064846416382251e-07, + "loss": 0.14, + "num_tokens": 175572848.0, + "reward": 0.9365234375, + "reward_std": 0.3601889908313751, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.86962890625, - "rewards/tag_count_reward/std": 0.25213465094566345, + "rewards/tag_count_reward/mean": 0.7802734375, + "rewards/tag_count_reward/std": 0.284225732088089, "step": 208 }, { @@ -6047,27 +6047,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.166015625, + "completions/clipped_ratio": 0.21484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 1117.779296875, - "completions/mean_terminated_length": 932.6065063476562, - "completions/min_length": 210.0, - "completions/min_terminated_length": 210.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1102.173828125, + "completions/mean_terminated_length": 843.3656616210938, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.07134932149867713, - "grad_norm": 0.14213435351848602, - "kl": 0.0330810546875, - "learning_rate": 7.123287671232876e-07, - "loss": 0.182, - "num_tokens": 146655126.0, - "reward": 1.01025390625, - "reward_std": 0.3315596878528595, - "rewards/accuracy_reward/mean": 0.162109375, - "rewards/accuracy_reward/std": 0.3689115643501282, + "grad_norm": 0.16120149195194244, + "kl": 0.01666259765625, + "learning_rate": 7.098976109215017e-07, + "loss": 0.2597, + "num_tokens": 176211801.0, + "reward": 0.865234375, + "reward_std": 0.3496299386024475, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.84814453125, - "rewards/tag_count_reward/std": 0.2758397161960602, + "rewards/tag_count_reward/mean": 0.736328125, + "rewards/tag_count_reward/std": 0.3109366297721863, "step": 209 }, { @@ -6076,27 +6076,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.146484375, + "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1069.072265625, - "completions/mean_terminated_length": 901.0640258789062, - "completions/min_length": 209.0, - "completions/min_terminated_length": 209.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1073.322265625, + "completions/mean_terminated_length": 800.4124755859375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, "epoch": 0.07169070581206793, - "grad_norm": 0.15966561436653137, - "kl": 0.03521728515625, - "learning_rate": 7.157534246575342e-07, - "loss": 0.2082, - "num_tokens": 147271755.0, - "reward": 0.92529296875, - "reward_std": 0.29846563935279846, - "rewards/accuracy_reward/mean": 0.06451612710952759, - "rewards/accuracy_reward/std": 0.2459181249141693, + "grad_norm": 0.15534524619579315, + "kl": 0.014801025390625, + "learning_rate": 7.133105802047781e-07, + "loss": 0.2832, + "num_tokens": 176830606.0, + "reward": 0.8349609375, + "reward_std": 0.32635965943336487, + "rewards/accuracy_reward/mean": 0.060483869165182114, + "rewards/accuracy_reward/std": 0.2386218160390854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.86279296875, - "rewards/tag_count_reward/std": 0.2623606026172638, + "rewards/tag_count_reward/mean": 0.7763671875, + "rewards/tag_count_reward/std": 0.30172058939933777, "step": 210 }, { @@ -6105,27 +6105,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.14453125, + "completions/clipped_ratio": 0.173828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1088.546875, - "completions/mean_terminated_length": 926.4474487304688, - "completions/min_length": 203.0, - "completions/min_terminated_length": 203.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1014.869140625, + "completions/mean_terminated_length": 797.4964599609375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.07203209012545873, - "grad_norm": 0.1488552987575531, - "kl": 0.0316162109375, - "learning_rate": 7.191780821917808e-07, - "loss": 0.1847, - "num_tokens": 147906131.0, - "reward": 1.0498046875, - "reward_std": 0.3397788405418396, - "rewards/accuracy_reward/mean": 0.189453125, - "rewards/accuracy_reward/std": 0.3922513723373413, + "grad_norm": 0.15638583898544312, + "kl": 0.0158538818359375, + "learning_rate": 7.167235494880546e-07, + "loss": 0.2694, + "num_tokens": 177427259.0, + "reward": 0.95703125, + "reward_std": 0.39338618516921997, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39069411158561707, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8603515625, - "rewards/tag_count_reward/std": 0.26176854968070984, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.29486700892448425, "step": 211 }, { @@ -6134,27 +6134,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.15234375, + "completions/clipped_ratio": 0.263671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1117.904296875, - "completions/mean_terminated_length": 950.7442626953125, - "completions/min_length": 208.0, - "completions/min_terminated_length": 208.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1199.0625, + "completions/mean_terminated_length": 895.0662841796875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, "epoch": 0.07237347443884953, - "grad_norm": 0.8496732711791992, - "kl": 0.05059814453125, - "learning_rate": 7.226027397260274e-07, - "loss": 0.1585, - "num_tokens": 148560514.0, - "reward": 0.95263671875, - "reward_std": 0.3026370406150818, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, + "grad_norm": 0.13579830527305603, + "kl": 0.01416015625, + "learning_rate": 7.201365187713311e-07, + "loss": 0.2345, + "num_tokens": 178123195.0, + "reward": 0.78271484375, + "reward_std": 0.3598169982433319, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.85888671875, - "rewards/tag_count_reward/std": 0.26308074593544006, + "rewards/tag_count_reward/mean": 0.70849609375, + "rewards/tag_count_reward/std": 0.31647688150405884, "step": 212 }, { @@ -6163,27 +6163,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.166015625, + "completions/clipped_ratio": 0.310546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 1193.779296875, - "completions/mean_terminated_length": 1023.7352905273438, - "completions/min_length": 190.0, - "completions/min_terminated_length": 190.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1289.328125, + "completions/mean_terminated_length": 947.6033935546875, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, "epoch": 0.07271485875224033, - "grad_norm": 0.1313561499118805, - "kl": 0.026763916015625, - "learning_rate": 7.260273972602739e-07, - "loss": 0.18, - "num_tokens": 149258769.0, - "reward": 0.9873046875, - "reward_std": 0.33825457096099854, - "rewards/accuracy_reward/mean": 0.134765625, - "rewards/accuracy_reward/std": 0.3418070077896118, + "grad_norm": 0.1423669010400772, + "kl": 0.0150604248046875, + "learning_rate": 7.235494880546075e-07, + "loss": 0.2477, + "num_tokens": 178870371.0, + "reward": 0.8017578125, + "reward_std": 0.35919225215911865, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8525390625, - "rewards/tag_count_reward/std": 0.26905539631843567, + "rewards/tag_count_reward/mean": 0.6982421875, + "rewards/tag_count_reward/std": 0.3163156807422638, "step": 213 }, { @@ -6192,27 +6192,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.099609375, + "completions/clipped_ratio": 0.142578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1982.0, - "completions/mean_length": 1029.296875, - "completions/mean_terminated_length": 916.5986938476562, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 991.017578125, + "completions/mean_terminated_length": 815.2551879882812, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.07305624306563113, - "grad_norm": 0.1539052575826645, - "kl": 0.032440185546875, - "learning_rate": 7.294520547945205e-07, - "loss": 0.1719, - "num_tokens": 149857833.0, - "reward": 0.92626953125, - "reward_std": 0.24862925708293915, - "rewards/accuracy_reward/mean": 0.032258063554763794, - "rewards/accuracy_reward/std": 0.17686307430267334, + "grad_norm": 0.1522628217935562, + "kl": 0.017059326171875, + "learning_rate": 7.269624573378839e-07, + "loss": 0.265, + "num_tokens": 179449836.0, + "reward": 0.83984375, + "reward_std": 0.2854743003845215, + "rewards/accuracy_reward/mean": 0.026209676638245583, + "rewards/accuracy_reward/std": 0.1599196344614029, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89501953125, - "rewards/tag_count_reward/std": 0.2284543514251709, + "rewards/tag_count_reward/mean": 0.814453125, + "rewards/tag_count_reward/std": 0.28108346462249756, "step": 214 }, { @@ -6221,27 +6221,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.142578125, + "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 1090.591796875, - "completions/mean_terminated_length": 931.3872680664062, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1168.271484375, + "completions/mean_terminated_length": 875.0286865234375, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, "epoch": 0.07339762737902193, - "grad_norm": 0.13333448767662048, - "kl": 0.03228759765625, - "learning_rate": 7.328767123287672e-07, - "loss": 0.1463, - "num_tokens": 150495608.0, - "reward": 0.96533203125, - "reward_std": 0.2588098347187042, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, + "grad_norm": 0.13755033910274506, + "kl": 0.01641845703125, + "learning_rate": 7.303754266211603e-07, + "loss": 0.2554, + "num_tokens": 180127383.0, + "reward": 0.8310546875, + "reward_std": 0.32114607095718384, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.86962890625, - "rewards/tag_count_reward/std": 0.25978022813796997, + "rewards/tag_count_reward/mean": 0.7412109375, + "rewards/tag_count_reward/std": 0.3115061819553375, "step": 215 }, { @@ -6250,27 +6250,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.142578125, + "completions/clipped_ratio": 0.185546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 1102.90234375, - "completions/mean_terminated_length": 945.7449340820312, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1150.740234375, + "completions/mean_terminated_length": 946.3285522460938, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, "epoch": 0.07373901169241273, - "grad_norm": 0.15049462020397186, - "kl": 0.03265380859375, - "learning_rate": 7.363013698630136e-07, - "loss": 0.1798, - "num_tokens": 151135670.0, - "reward": 1.0068359375, - "reward_std": 0.30560004711151123, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, + "grad_norm": 0.12239421904087067, + "kl": 0.0146026611328125, + "learning_rate": 7.337883959044369e-07, + "loss": 0.2139, + "num_tokens": 180791938.0, + "reward": 0.9150390625, + "reward_std": 0.34235459566116333, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8759765625, - "rewards/tag_count_reward/std": 0.2579442858695984, + "rewards/tag_count_reward/mean": 0.7919921875, + "rewards/tag_count_reward/std": 0.2904115617275238, "step": 216 }, { @@ -6279,27 +6279,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.13671875, + "completions/clipped_ratio": 0.1953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 1070.8828125, - "completions/mean_terminated_length": 916.1358032226562, - "completions/min_length": 182.0, - "completions/min_terminated_length": 182.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1045.814453125, + "completions/mean_terminated_length": 802.5655517578125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, "epoch": 0.07408039600580353, - "grad_norm": 0.13311637938022614, - "kl": 0.028564453125, - "learning_rate": 7.397260273972602e-07, - "loss": 0.1435, - "num_tokens": 151760234.0, - "reward": 1.0458984375, - "reward_std": 0.30893051624298096, - "rewards/accuracy_reward/mean": 0.16796875, - "rewards/accuracy_reward/std": 0.374204158782959, + "grad_norm": 0.16206489503383636, + "kl": 0.0171661376953125, + "learning_rate": 7.372013651877133e-07, + "loss": 0.2219, + "num_tokens": 181403667.0, + "reward": 0.9501953125, + "reward_std": 0.34202441573143005, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8779296875, - "rewards/tag_count_reward/std": 0.2536257803440094, + "rewards/tag_count_reward/mean": 0.7998046875, + "rewards/tag_count_reward/std": 0.2954481244087219, "step": 217 }, { @@ -6308,27 +6308,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.146484375, + "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 1083.294921875, - "completions/mean_terminated_length": 917.7276611328125, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1105.482421875, + "completions/mean_terminated_length": 841.5774536132812, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, "epoch": 0.07442178031919433, - "grad_norm": 0.15257856249809265, - "kl": 0.03009033203125, - "learning_rate": 7.431506849315068e-07, - "loss": 0.2131, - "num_tokens": 152389297.0, - "reward": 1.00390625, - "reward_std": 0.2963552474975586, - "rewards/accuracy_reward/mean": 0.13671875, - "rewards/accuracy_reward/std": 0.3438861668109894, + "grad_norm": 0.21797847747802734, + "kl": 0.02508544921875, + "learning_rate": 7.406143344709898e-07, + "loss": 0.2408, + "num_tokens": 182044090.0, + "reward": 0.9013671875, + "reward_std": 0.3440747857093811, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8671875, - "rewards/tag_count_reward/std": 0.25544461607933044, + "rewards/tag_count_reward/mean": 0.7666015625, + "rewards/tag_count_reward/std": 0.30241650342941284, "step": 218 }, { @@ -6337,27 +6337,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.140625, + "completions/clipped_ratio": 0.197265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 1074.134765625, - "completions/mean_terminated_length": 914.7749633789062, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1082.025390625, + "completions/mean_terminated_length": 844.644775390625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, "epoch": 0.07476316463258513, - "grad_norm": 0.1511964648962021, - "kl": 0.030914306640625, - "learning_rate": 7.465753424657533e-07, - "loss": 0.2, - "num_tokens": 153029862.0, - "reward": 0.98779296875, - "reward_std": 0.3228898048400879, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, + "grad_norm": 0.13452063500881195, + "kl": 0.014129638671875, + "learning_rate": 7.440273037542661e-07, + "loss": 0.2309, + "num_tokens": 182688695.0, + "reward": 0.88037109375, + "reward_std": 0.3124554753303528, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87451171875, - "rewards/tag_count_reward/std": 0.25315961241722107, + "rewards/tag_count_reward/mean": 0.77294921875, + "rewards/tag_count_reward/std": 0.2985379993915558, "step": 219 }, { @@ -6366,27 +6366,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.107421875, + "completions/clipped_ratio": 0.208984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1098.11328125, - "completions/mean_terminated_length": 983.7943115234375, - "completions/min_length": 236.0, - "completions/min_terminated_length": 236.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1136.6875, + "completions/mean_terminated_length": 895.9210205078125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, "epoch": 0.07510454894597593, - "grad_norm": 0.13734576106071472, - "kl": 0.027099609375, - "learning_rate": 7.5e-07, - "loss": 0.172, - "num_tokens": 153673248.0, - "reward": 0.97900390625, - "reward_std": 0.26450347900390625, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, + "grad_norm": 0.13103114068508148, + "kl": 0.0148773193359375, + "learning_rate": 7.474402730375426e-07, + "loss": 0.2316, + "num_tokens": 183351831.0, + "reward": 0.86279296875, + "reward_std": 0.3280552327632904, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.21513772010803223, + "rewards/tag_count_reward/mean": 0.78076171875, + "rewards/tag_count_reward/std": 0.30832546949386597, "step": 220 }, { @@ -6395,27 +6395,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.138671875, + "completions/clipped_ratio": 0.1640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1134.908203125, - "completions/mean_terminated_length": 987.9025268554688, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1070.05859375, + "completions/mean_terminated_length": 878.1261596679688, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.07544593325936673, - "grad_norm": 0.13219542801380157, - "kl": 0.027252197265625, - "learning_rate": 7.534246575342466e-07, - "loss": 0.1591, - "num_tokens": 154327041.0, - "reward": 1.0224609375, - "reward_std": 0.29937922954559326, - "rewards/accuracy_reward/mean": 0.15234375, - "rewards/accuracy_reward/std": 0.35970520973205566, + "grad_norm": 0.1350395828485489, + "kl": 0.0158843994140625, + "learning_rate": 7.50853242320819e-07, + "loss": 0.2211, + "num_tokens": 183972421.0, + "reward": 0.92626953125, + "reward_std": 0.3565624952316284, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8701171875, - "rewards/tag_count_reward/std": 0.24774044752120972, + "rewards/tag_count_reward/mean": 0.77978515625, + "rewards/tag_count_reward/std": 0.28577929735183716, "step": 221 }, { @@ -6424,27 +6424,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.14453125, + "completions/clipped_ratio": 0.169921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 1079.619140625, - "completions/mean_terminated_length": 916.0113525390625, - "completions/min_length": 174.0, - "completions/min_terminated_length": 174.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1053.357421875, + "completions/mean_terminated_length": 849.7482299804688, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, "epoch": 0.07578731757275753, - "grad_norm": 0.1405760943889618, - "kl": 0.03057861328125, - "learning_rate": 7.568493150684932e-07, - "loss": 0.2047, - "num_tokens": 154958494.0, - "reward": 1.04248046875, - "reward_std": 0.28535062074661255, - "rewards/accuracy_reward/mean": 0.158203125, - "rewards/accuracy_reward/std": 0.36528825759887695, + "grad_norm": 0.14394913613796234, + "kl": 0.0170440673828125, + "learning_rate": 7.542662116040955e-07, + "loss": 0.1876, + "num_tokens": 184590428.0, + "reward": 0.96044921875, + "reward_std": 0.364574134349823, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88427734375, - "rewards/tag_count_reward/std": 0.2385578453540802, + "rewards/tag_count_reward/mean": 0.78857421875, + "rewards/tag_count_reward/std": 0.2830006182193756, "step": 222 }, { @@ -6453,27 +6453,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1484375, + "completions/clipped_ratio": 0.185546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 1130.537109375, - "completions/mean_terminated_length": 970.6123657226562, - "completions/min_length": 81.0, - "completions/min_terminated_length": 81.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1074.298828125, + "completions/mean_terminated_length": 852.472412109375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.07612870188614833, - "grad_norm": 0.13484491407871246, - "kl": 0.031005859375, - "learning_rate": 7.602739726027397e-07, - "loss": 0.1306, - "num_tokens": 155615889.0, - "reward": 1.099609375, - "reward_std": 0.3330080509185791, + "grad_norm": 0.15169014036655426, + "kl": 0.0179595947265625, + "learning_rate": 7.57679180887372e-07, + "loss": 0.2147, + "num_tokens": 185219029.0, + "reward": 1.0185546875, + "reward_std": 0.3537110388278961, "rewards/accuracy_reward/mean": 0.21484375, "rewards/accuracy_reward/std": 0.4111155867576599, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.884765625, - "rewards/tag_count_reward/std": 0.2446138858795166, + "rewards/tag_count_reward/mean": 0.8037109375, + "rewards/tag_count_reward/std": 0.29309719800949097, "step": 223 }, { @@ -6482,27 +6482,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.146484375, + "completions/clipped_ratio": 0.193359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 1042.552734375, - "completions/mean_terminated_length": 869.9931030273438, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1050.001953125, + "completions/mean_terminated_length": 810.7724609375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, "epoch": 0.07647008619953913, - "grad_norm": 0.1428431272506714, - "kl": 0.029296875, - "learning_rate": 7.636986301369863e-07, - "loss": 0.1659, - "num_tokens": 156226780.0, - "reward": 0.94140625, - "reward_std": 0.2550290822982788, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, + "grad_norm": 0.150232195854187, + "kl": 0.01727294921875, + "learning_rate": 7.610921501706485e-07, + "loss": 0.2529, + "num_tokens": 185833734.0, + "reward": 0.83984375, + "reward_std": 0.3130366802215576, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.875, - "rewards/tag_count_reward/std": 0.2565196752548218, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.3004145324230194, "step": 224 }, { @@ -6511,27 +6511,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1484375, + "completions/clipped_ratio": 0.197265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1103.482421875, - "completions/mean_terminated_length": 938.8416748046875, - "completions/min_length": 197.0, - "completions/min_terminated_length": 197.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1091.376953125, + "completions/mean_terminated_length": 856.2943725585938, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, "epoch": 0.07681147051292993, - "grad_norm": 0.14843755960464478, - "kl": 0.029388427734375, - "learning_rate": 7.671232876712328e-07, - "loss": 0.1851, - "num_tokens": 156872675.0, - "reward": 0.99462890625, - "reward_std": 0.3229602575302124, - "rewards/accuracy_reward/mean": 0.12890625, - "rewards/accuracy_reward/std": 0.33542385697364807, + "grad_norm": 0.14438505470752716, + "kl": 0.0169677734375, + "learning_rate": 7.645051194539249e-07, + "loss": 0.2371, + "num_tokens": 186473431.0, + "reward": 0.880859375, + "reward_std": 0.3424620032310486, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.86572265625, - "rewards/tag_count_reward/std": 0.25730398297309875, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.3019627630710602, "step": 225 }, { @@ -6540,27 +6540,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.08984375, + "completions/clipped_ratio": 0.126953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 960.49609375, - "completions/mean_terminated_length": 853.1459350585938, - "completions/min_length": 102.0, - "completions/min_terminated_length": 102.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 942.357421875, + "completions/mean_terminated_length": 781.5816650390625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, "epoch": 0.07715285482632073, - "grad_norm": 0.16743634641170502, - "kl": 0.033355712890625, - "learning_rate": 7.705479452054794e-07, - "loss": 0.1358, - "num_tokens": 157447953.0, - "reward": 1.10009765625, - "reward_std": 0.2929975390434265, - "rewards/accuracy_reward/mean": 0.1895161271095276, - "rewards/accuracy_reward/std": 0.39231374859809875, + "grad_norm": 0.19273056089878082, + "kl": 0.023223876953125, + "learning_rate": 7.679180887372013e-07, + "loss": 0.1726, + "num_tokens": 187039422.0, + "reward": 0.97119140625, + "reward_std": 0.3649890422821045, + "rewards/accuracy_reward/mean": 0.15322580933570862, + "rewards/accuracy_reward/std": 0.36056873202323914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.20922017097473145, + "rewards/tag_count_reward/mean": 0.82275390625, + "rewards/tag_count_reward/std": 0.26945772767066956, "step": 226 }, { @@ -6569,27 +6569,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.201171875, + "completions/clipped_ratio": 0.212890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 1157.2578125, - "completions/mean_terminated_length": 932.9389038085938, - "completions/min_length": 200.0, - "completions/min_terminated_length": 200.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1111.291015625, + "completions/mean_terminated_length": 857.93798828125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, "epoch": 0.07749423913971153, - "grad_norm": 0.14617608487606049, - "kl": 0.0316162109375, - "learning_rate": 7.73972602739726e-07, - "loss": 0.1841, - "num_tokens": 158119637.0, - "reward": 0.91748046875, - "reward_std": 0.332801878452301, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, + "grad_norm": 0.14280608296394348, + "kl": 0.0173187255859375, + "learning_rate": 7.713310580204778e-07, + "loss": 0.3246, + "num_tokens": 187687571.0, + "reward": 0.8671875, + "reward_std": 0.34099018573760986, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.83154296875, - "rewards/tag_count_reward/std": 0.29562392830848694, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.3079785406589508, "step": 227 }, { @@ -6598,27 +6598,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1640625, + "completions/clipped_ratio": 0.17578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 1132.5859375, - "completions/mean_terminated_length": 952.9252319335938, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1069.025390625, + "completions/mean_terminated_length": 860.2393798828125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, "epoch": 0.07783562345310233, - "grad_norm": 1.4102917909622192, - "kl": 0.04827880859375, - "learning_rate": 7.773972602739726e-07, - "loss": 0.1814, - "num_tokens": 158770721.0, - "reward": 1.0283203125, - "reward_std": 0.34487906098365784, - "rewards/accuracy_reward/mean": 0.158203125, - "rewards/accuracy_reward/std": 0.36528825759887695, + "grad_norm": 0.1343468725681305, + "kl": 0.017791748046875, + "learning_rate": 7.747440273037542e-07, + "loss": 0.2295, + "num_tokens": 188306112.0, + "reward": 0.93408203125, + "reward_std": 0.3749796450138092, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8701171875, - "rewards/tag_count_reward/std": 0.25262919068336487, + "rewards/tag_count_reward/mean": 0.79736328125, + "rewards/tag_count_reward/std": 0.28767555952072144, "step": 228 }, { @@ -6627,27 +6627,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.189453125, + "completions/clipped_ratio": 0.220703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, - "completions/mean_length": 1147.451171875, - "completions/mean_terminated_length": 936.9614868164062, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/mean_length": 1106.78515625, + "completions/mean_terminated_length": 840.2255859375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, "epoch": 0.07817700776649313, - "grad_norm": 0.1398928165435791, - "kl": 0.030181884765625, - "learning_rate": 7.808219178082191e-07, - "loss": 0.1918, - "num_tokens": 159442840.0, - "reward": 1.0068359375, - "reward_std": 0.36788713932037354, - "rewards/accuracy_reward/mean": 0.16935484111309052, - "rewards/accuracy_reward/std": 0.3754436671733856, + "grad_norm": 0.12399096041917801, + "kl": 0.016082763671875, + "learning_rate": 7.781569965870307e-07, + "loss": 0.2438, + "num_tokens": 188957410.0, + "reward": 0.92919921875, + "reward_std": 0.38114863634109497, + "rewards/accuracy_reward/mean": 0.15322580933570862, + "rewards/accuracy_reward/std": 0.36056873202323914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8427734375, - "rewards/tag_count_reward/std": 0.27659255266189575, + "rewards/tag_count_reward/mean": 0.78076171875, + "rewards/tag_count_reward/std": 0.3059360682964325, "step": 229 }, { @@ -6656,27 +6656,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.2109375, + "completions/clipped_ratio": 0.224609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 1174.890625, - "completions/mean_terminated_length": 941.485107421875, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1131.03515625, + "completions/mean_terminated_length": 865.4155883789062, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, "epoch": 0.07851839207988393, - "grad_norm": 0.15699157118797302, - "kl": 0.0335693359375, - "learning_rate": 7.842465753424657e-07, - "loss": 0.2296, - "num_tokens": 160128288.0, - "reward": 0.8662109375, - "reward_std": 0.31422632932662964, - "rewards/accuracy_reward/mean": 0.04435483738780022, - "rewards/accuracy_reward/std": 0.2060900777578354, + "grad_norm": 0.1489233374595642, + "kl": 0.018310546875, + "learning_rate": 7.815699658703071e-07, + "loss": 0.2707, + "num_tokens": 189620404.0, + "reward": 0.79296875, + "reward_std": 0.3181414306163788, + "rewards/accuracy_reward/mean": 0.038306452333927155, + "rewards/accuracy_reward/std": 0.19212883710861206, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8232421875, - "rewards/tag_count_reward/std": 0.2959393262863159, + "rewards/tag_count_reward/mean": 0.755859375, + "rewards/tag_count_reward/std": 0.3103953003883362, "step": 230 }, { @@ -6685,27 +6685,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.22265625, + "completions/clipped_ratio": 0.197265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1187.904296875, - "completions/mean_terminated_length": 941.5452270507812, - "completions/min_length": 182.0, - "completions/min_terminated_length": 182.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1049.0625, + "completions/mean_terminated_length": 803.5814819335938, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, "epoch": 0.07885977639327472, - "grad_norm": 0.14857465028762817, - "kl": 0.03204345703125, - "learning_rate": 7.876712328767124e-07, - "loss": 0.18, - "num_tokens": 160812847.0, - "reward": 0.95654296875, - "reward_std": 0.3289763033390045, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, + "grad_norm": 0.15656845271587372, + "kl": 0.01873779296875, + "learning_rate": 7.849829351535836e-07, + "loss": 0.2422, + "num_tokens": 190233876.0, + "reward": 0.900390625, + "reward_std": 0.3332728445529938, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.82373046875, - "rewards/tag_count_reward/std": 0.2914441227912903, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.29548853635787964, "step": 231 }, { @@ -6714,27 +6714,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.119140625, + "completions/clipped_ratio": 0.1484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 1065.884765625, - "completions/mean_terminated_length": 933.0487670898438, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1015.703125, + "completions/mean_terminated_length": 835.7614135742188, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, "epoch": 0.07920116070666552, - "grad_norm": 0.13972456753253937, - "kl": 0.03094482421875, - "learning_rate": 7.91095890410959e-07, - "loss": 0.1981, - "num_tokens": 161438404.0, - "reward": 0.96533203125, - "reward_std": 0.24260011315345764, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, + "grad_norm": 0.1452239751815796, + "kl": 0.018707275390625, + "learning_rate": 7.8839590443686e-07, + "loss": 0.2323, + "num_tokens": 190833740.0, + "reward": 0.8779296875, + "reward_std": 0.29707878828048706, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89501953125, - "rewards/tag_count_reward/std": 0.23217174410820007, + "rewards/tag_count_reward/mean": 0.8037109375, + "rewards/tag_count_reward/std": 0.28462886810302734, "step": 232 }, { @@ -6743,27 +6743,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1875, + "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 1175.013671875, - "completions/mean_terminated_length": 973.5552978515625, - "completions/min_length": 240.0, - "completions/min_terminated_length": 240.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1084.890625, + "completions/mean_terminated_length": 885.0, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, "epoch": 0.07954254502005632, - "grad_norm": 0.7790918946266174, - "kl": 0.036651611328125, - "learning_rate": 7.945205479452054e-07, - "loss": 0.1494, - "num_tokens": 162115835.0, - "reward": 1.0009765625, - "reward_std": 0.34966665506362915, - "rewards/accuracy_reward/mean": 0.16015625, - "rewards/accuracy_reward/std": 0.3671095669269562, + "grad_norm": 0.15919525921344757, + "kl": 0.019744873046875, + "learning_rate": 7.918088737201365e-07, + "loss": 0.2261, + "num_tokens": 191465028.0, + "reward": 0.95654296875, + "reward_std": 0.3967236280441284, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8408203125, - "rewards/tag_count_reward/std": 0.2863559424877167, + "rewards/tag_count_reward/mean": 0.80419921875, + "rewards/tag_count_reward/std": 0.2915424704551697, "step": 233 }, { @@ -6772,27 +6772,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.12890625, + "completions/clipped_ratio": 0.173828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 1065.4765625, - "completions/mean_terminated_length": 920.0807495117188, - "completions/min_length": 31.0, - "completions/min_terminated_length": 31.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1050.771484375, + "completions/mean_terminated_length": 840.9526977539062, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, "epoch": 0.07988392933344712, - "grad_norm": 0.18480327725410461, - "kl": 0.0311279296875, - "learning_rate": 7.97945205479452e-07, - "loss": 0.2279, - "num_tokens": 162742239.0, - "reward": 0.95263671875, - "reward_std": 0.2685253620147705, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, + "grad_norm": 0.16227871179580688, + "kl": 0.019683837890625, + "learning_rate": 7.952218430034129e-07, + "loss": 0.2561, + "num_tokens": 192083903.0, + "reward": 0.84716796875, + "reward_std": 0.2716546058654785, + "rewards/accuracy_reward/mean": 0.03515625, + "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88818359375, - "rewards/tag_count_reward/std": 0.23107852041721344, + "rewards/tag_count_reward/mean": 0.81201171875, + "rewards/tag_count_reward/std": 0.28615689277648926, "step": 234 }, { @@ -6801,27 +6801,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.189453125, + "completions/clipped_ratio": 0.2265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 1140.896484375, - "completions/mean_terminated_length": 928.8746948242188, - "completions/min_length": 195.0, - "completions/min_terminated_length": 195.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1104.4921875, + "completions/mean_terminated_length": 828.111083984375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, "epoch": 0.08022531364683792, - "grad_norm": 0.11098338663578033, - "kl": 0.02996826171875, - "learning_rate": 8.013698630136985e-07, - "loss": 0.1437, - "num_tokens": 163399050.0, - "reward": 0.99462890625, - "reward_std": 0.3432961106300354, - "rewards/accuracy_reward/mean": 0.1484375, - "rewards/accuracy_reward/std": 0.35588082671165466, + "grad_norm": 0.13913939893245697, + "kl": 0.0223388671875, + "learning_rate": 7.986348122866893e-07, + "loss": 0.2385, + "num_tokens": 192722075.0, + "reward": 0.88525390625, + "reward_std": 0.3522019386291504, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.84619140625, - "rewards/tag_count_reward/std": 0.2813514769077301, + "rewards/tag_count_reward/mean": 0.77197265625, + "rewards/tag_count_reward/std": 0.30429190397262573, "step": 235 }, { @@ -6830,27 +6830,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.115234375, + "completions/clipped_ratio": 0.119140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 1028.30078125, - "completions/mean_terminated_length": 895.4922485351562, - "completions/min_length": 205.0, - "completions/min_terminated_length": 205.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 953.42578125, + "completions/mean_terminated_length": 805.379150390625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, "epoch": 0.08056669796022872, - "grad_norm": 0.14085927605628967, - "kl": 0.03192138671875, - "learning_rate": 8.047945205479451e-07, - "loss": 0.1663, - "num_tokens": 163999924.0, - "reward": 1.03271484375, - "reward_std": 0.3006260395050049, - "rewards/accuracy_reward/mean": 0.150390625, - "rewards/accuracy_reward/std": 0.35780346393585205, + "grad_norm": 0.14855922758579254, + "kl": 0.0213623046875, + "learning_rate": 8.020477815699659e-07, + "loss": 0.2454, + "num_tokens": 193284613.0, + "reward": 0.96923828125, + "reward_std": 0.3534427881240845, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88232421875, - "rewards/tag_count_reward/std": 0.2421872466802597, + "rewards/tag_count_reward/mean": 0.82080078125, + "rewards/tag_count_reward/std": 0.2616969048976898, "step": 236 }, { @@ -6859,27 +6859,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.15234375, + "completions/clipped_ratio": 0.185546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 1206.373046875, - "completions/mean_terminated_length": 1055.1129150390625, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1144.033203125, + "completions/mean_terminated_length": 938.0935668945312, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.08090808227361952, - "grad_norm": 0.11820080131292343, - "kl": 0.029144287109375, - "learning_rate": 8.082191780821918e-07, - "loss": 0.1273, - "num_tokens": 164700675.0, - "reward": 1.02783203125, - "reward_std": 0.3371480107307434, - "rewards/accuracy_reward/mean": 0.162109375, - "rewards/accuracy_reward/std": 0.3689115643501282, + "grad_norm": 0.12940514087677002, + "kl": 0.018157958984375, + "learning_rate": 8.054607508532423e-07, + "loss": 0.1829, + "num_tokens": 193953446.0, + "reward": 0.9619140625, + "reward_std": 0.38028883934020996, + "rewards/accuracy_reward/mean": 0.173828125, + "rewards/accuracy_reward/std": 0.3793322443962097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.86572265625, - "rewards/tag_count_reward/std": 0.2490920126438141, + "rewards/tag_count_reward/mean": 0.7880859375, + "rewards/tag_count_reward/std": 0.29010871052742004, "step": 237 }, { @@ -6888,27 +6888,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.189453125, + "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 1126.564453125, - "completions/mean_terminated_length": 911.1928100585938, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1033.50390625, + "completions/mean_terminated_length": 774.9068603515625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, "epoch": 0.08124946658701032, - "grad_norm": 3.9634222984313965, - "kl": 0.07635498046875, - "learning_rate": 8.116438356164384e-07, - "loss": 0.1334, - "num_tokens": 165351540.0, - "reward": 0.95947265625, - "reward_std": 0.2637588679790497, + "grad_norm": 0.14404235780239105, + "kl": 0.021697998046875, + "learning_rate": 8.088737201365188e-07, + "loss": 0.2078, + "num_tokens": 194556664.0, + "reward": 0.9169921875, + "reward_std": 0.3091548681259155, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.85205078125, - "rewards/tag_count_reward/std": 0.2766353189945221, + "rewards/tag_count_reward/mean": 0.8095703125, + "rewards/tag_count_reward/std": 0.28858959674835205, "step": 238 }, { @@ -6917,27 +6917,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.08203125, + "completions/clipped_ratio": 0.166015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 1002.380859375, - "completions/mean_terminated_length": 908.9425048828125, - "completions/min_length": 233.0, - "completions/min_terminated_length": 233.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 974.3046875, + "completions/mean_terminated_length": 760.5714111328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, "epoch": 0.08159085090040112, - "grad_norm": 0.15155275166034698, - "kl": 0.032318115234375, - "learning_rate": 8.150684931506849e-07, - "loss": 0.0906, - "num_tokens": 165945431.0, - "reward": 1.052734375, - "reward_std": 0.2858385145664215, - "rewards/accuracy_reward/mean": 0.14453125, - "rewards/accuracy_reward/std": 0.35197147727012634, + "grad_norm": 0.16828224062919617, + "kl": 0.02117919921875, + "learning_rate": 8.122866894197952e-07, + "loss": 0.2004, + "num_tokens": 195136180.0, + "reward": 0.94482421875, + "reward_std": 0.3471168875694275, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.908203125, - "rewards/tag_count_reward/std": 0.21358256042003632, + "rewards/tag_count_reward/mean": 0.80419921875, + "rewards/tag_count_reward/std": 0.29028117656707764, "step": 239 }, { @@ -6946,27 +6946,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.203125, + "completions/clipped_ratio": 0.16796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 1200.95703125, - "completions/mean_terminated_length": 985.0441284179688, - "completions/min_length": 198.0, - "completions/min_terminated_length": 198.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1041.275390625, + "completions/mean_terminated_length": 838.0399169921875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, "epoch": 0.08193223521379192, - "grad_norm": 0.1461395025253296, - "kl": 0.03375244140625, - "learning_rate": 8.184931506849315e-07, - "loss": 0.2138, - "num_tokens": 166633793.0, - "reward": 0.9755859375, - "reward_std": 0.33675751090049744, - "rewards/accuracy_reward/mean": 0.14516128599643707, - "rewards/accuracy_reward/std": 0.3526190221309662, + "grad_norm": 0.14916828274726868, + "kl": 0.023284912109375, + "learning_rate": 8.156996587030717e-07, + "loss": 0.2065, + "num_tokens": 195742785.0, + "reward": 0.962890625, + "reward_std": 0.3675592541694641, + "rewards/accuracy_reward/mean": 0.1572580635547638, + "rewards/accuracy_reward/std": 0.36441144347190857, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8349609375, - "rewards/tag_count_reward/std": 0.2885763645172119, + "rewards/tag_count_reward/mean": 0.810546875, + "rewards/tag_count_reward/std": 0.28281864523887634, "step": 240 }, { @@ -6975,27 +6975,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.134765625, + "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1947.0, - "completions/mean_length": 1022.099609375, - "completions/mean_terminated_length": 862.3092651367188, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 987.701171875, + "completions/mean_terminated_length": 743.016845703125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, "epoch": 0.08227361952718272, - "grad_norm": 0.152372807264328, - "kl": 0.03533935546875, - "learning_rate": 8.21917808219178e-07, - "loss": 0.1733, - "num_tokens": 167230468.0, - "reward": 1.01318359375, - "reward_std": 0.3202892541885376, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, + "grad_norm": 0.1716306358575821, + "kl": 0.02398681640625, + "learning_rate": 8.19112627986348e-07, + "loss": 0.1742, + "num_tokens": 196321848.0, + "reward": 0.96044921875, + "reward_std": 0.3217233121395111, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88037109375, - "rewards/tag_count_reward/std": 0.2477303296327591, + "rewards/tag_count_reward/mean": 0.81591796875, + "rewards/tag_count_reward/std": 0.2908008098602295, "step": 241 }, { @@ -7004,27 +7004,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.12109375, + "completions/clipped_ratio": 0.189453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 1018.638671875, - "completions/mean_terminated_length": 876.8155517578125, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 1032.234375, + "completions/mean_terminated_length": 794.814453125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, "epoch": 0.08261500384057352, - "grad_norm": 0.14142490923404694, - "kl": 0.031280517578125, - "learning_rate": 8.253424657534246e-07, - "loss": 0.127, - "num_tokens": 167830507.0, - "reward": 1.02880859375, - "reward_std": 0.27235502004623413, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, + "grad_norm": 0.14463070034980774, + "kl": 0.019287109375, + "learning_rate": 8.225255972696245e-07, + "loss": 0.2317, + "num_tokens": 196928848.0, + "reward": 0.92724609375, + "reward_std": 0.3264089822769165, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89599609375, - "rewards/tag_count_reward/std": 0.23049886524677277, + "rewards/tag_count_reward/mean": 0.80810546875, + "rewards/tag_count_reward/std": 0.2831153869628906, "step": 242 }, { @@ -7033,27 +7033,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1484375, + "completions/clipped_ratio": 0.2109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 1119.408203125, - "completions/mean_terminated_length": 957.5435180664062, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1126.90625, + "completions/mean_terminated_length": 880.6732788085938, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.08295638815396432, - "grad_norm": 0.1320219784975052, - "kl": 0.031768798828125, - "learning_rate": 8.287671232876712e-07, - "loss": 0.137, - "num_tokens": 168483116.0, - "reward": 0.9833984375, - "reward_std": 0.2867240011692047, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, + "grad_norm": 0.12367022782564163, + "kl": 0.020111083984375, + "learning_rate": 8.259385665529009e-07, + "loss": 0.1738, + "num_tokens": 197585296.0, + "reward": 0.94873046875, + "reward_std": 0.36049264669418335, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8603515625, - "rewards/tag_count_reward/std": 0.26868578791618347, + "rewards/tag_count_reward/mean": 0.78076171875, + "rewards/tag_count_reward/std": 0.3035278618335724, "step": 243 }, { @@ -7062,27 +7062,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.130859375, + "completions/clipped_ratio": 0.189453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 1146.990234375, - "completions/mean_terminated_length": 1011.3325805664062, - "completions/min_length": 66.0, - "completions/min_terminated_length": 66.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1087.17578125, + "completions/mean_terminated_length": 862.5975952148438, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, "epoch": 0.08329777246735512, - "grad_norm": 0.12730756402015686, - "kl": 0.0306396484375, - "learning_rate": 8.321917808219178e-07, - "loss": 0.1091, - "num_tokens": 169149015.0, - "reward": 0.97607421875, - "reward_std": 0.3049416244029999, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, + "grad_norm": 0.14897318184375763, + "kl": 0.020477294921875, + "learning_rate": 8.293515358361775e-07, + "loss": 0.2221, + "num_tokens": 198220570.0, + "reward": 0.87255859375, + "reward_std": 0.33564668893814087, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87451171875, - "rewards/tag_count_reward/std": 0.25073230266571045, + "rewards/tag_count_reward/mean": 0.78662109375, + "rewards/tag_count_reward/std": 0.2967562675476074, "step": 244 }, { @@ -7091,27 +7091,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.134765625, + "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1037.759765625, - "completions/mean_terminated_length": 880.4085693359375, - "completions/min_length": 99.0, - "completions/min_terminated_length": 99.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1089.884765625, + "completions/mean_terminated_length": 796.5841674804688, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, "epoch": 0.08363915678074592, - "grad_norm": 0.15358765423297882, - "kl": 0.0301513671875, - "learning_rate": 8.356164383561643e-07, - "loss": 0.1985, - "num_tokens": 169754604.0, - "reward": 0.96484375, - "reward_std": 0.29754340648651123, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, + "grad_norm": 0.14930923283100128, + "kl": 0.021484375, + "learning_rate": 8.327645051194539e-07, + "loss": 0.2469, + "num_tokens": 198852847.0, + "reward": 0.8369140625, + "reward_std": 0.32589638233184814, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8828125, - "rewards/tag_count_reward/std": 0.2461792230606079, + "rewards/tag_count_reward/mean": 0.7568359375, + "rewards/tag_count_reward/std": 0.3123394250869751, "step": 245 }, { @@ -7120,27 +7120,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1640625, + "completions/clipped_ratio": 0.2421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1990.0, - "completions/mean_length": 1161.271484375, - "completions/mean_terminated_length": 987.2406005859375, - "completions/min_length": 183.0, - "completions/min_terminated_length": 183.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1160.0625, + "completions/mean_terminated_length": 876.2886352539062, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.08398054109413672, - "grad_norm": 0.2361176609992981, - "kl": 0.03265380859375, - "learning_rate": 8.390410958904109e-07, - "loss": 0.1784, - "num_tokens": 170428327.0, - "reward": 0.94482421875, - "reward_std": 0.28001147508621216, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, + "grad_norm": 0.14294405281543732, + "kl": 0.021026611328125, + "learning_rate": 8.361774744027303e-07, + "loss": 0.226, + "num_tokens": 199525951.0, + "reward": 0.84326171875, + "reward_std": 0.32320284843444824, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.85888671875, - "rewards/tag_count_reward/std": 0.26493385434150696, + "rewards/tag_count_reward/mean": 0.76513671875, + "rewards/tag_count_reward/std": 0.30909308791160583, "step": 246 }, { @@ -7149,27 +7149,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.12109375, + "completions/clipped_ratio": 0.216796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1999.0, - "completions/mean_length": 1140.546875, - "completions/mean_terminated_length": 1015.52001953125, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1147.544921875, + "completions/mean_terminated_length": 898.2918090820312, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, "epoch": 0.08432192540752752, - "grad_norm": 0.14363712072372437, - "kl": 0.0302734375, - "learning_rate": 8.424657534246576e-07, - "loss": 0.1529, - "num_tokens": 171094079.0, - "reward": 1.06689453125, - "reward_std": 0.3600861132144928, - "rewards/accuracy_reward/mean": 0.171875, - "rewards/accuracy_reward/std": 0.3776407241821289, + "grad_norm": 0.12825725972652435, + "kl": 0.019287109375, + "learning_rate": 8.395904436860067e-07, + "loss": 0.2296, + "num_tokens": 200195286.0, + "reward": 0.9365234375, + "reward_std": 0.36513566970825195, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89501953125, - "rewards/tag_count_reward/std": 0.23269794881343842, + "rewards/tag_count_reward/mean": 0.7861328125, + "rewards/tag_count_reward/std": 0.29866480827331543, "step": 247 }, { @@ -7178,27 +7178,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.12890625, + "completions/clipped_ratio": 0.2109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 1115.400390625, - "completions/mean_terminated_length": 977.3924560546875, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1128.236328125, + "completions/mean_terminated_length": 882.35888671875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.08466330972091832, - "grad_norm": 0.15197043120861053, - "kl": 0.0330810546875, - "learning_rate": 8.458904109589042e-07, - "loss": 0.1532, - "num_tokens": 171746684.0, - "reward": 1.00927734375, - "reward_std": 0.28392645716667175, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, + "grad_norm": 0.1298787146806717, + "kl": 0.02008056640625, + "learning_rate": 8.430034129692832e-07, + "loss": 0.2165, + "num_tokens": 200854463.0, + "reward": 0.88427734375, + "reward_std": 0.34482067823410034, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88623046875, - "rewards/tag_count_reward/std": 0.23693007230758667, + "rewards/tag_count_reward/mean": 0.77685546875, + "rewards/tag_count_reward/std": 0.30106866359710693, "step": 248 }, { @@ -7207,27 +7207,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.134765625, + "completions/clipped_ratio": 0.166015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 1025.802734375, - "completions/mean_terminated_length": 866.5891723632812, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 1008.9453125, + "completions/mean_terminated_length": 802.107666015625, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, "epoch": 0.08500469403430912, - "grad_norm": 0.15088102221488953, - "kl": 0.033111572265625, - "learning_rate": 8.493150684931506e-07, - "loss": 0.1955, - "num_tokens": 172352775.0, - "reward": 0.99365234375, - "reward_std": 0.2636690139770508, - "rewards/accuracy_reward/mean": 0.11088709533214569, - "rewards/accuracy_reward/std": 0.3143092691898346, + "grad_norm": 0.14396578073501587, + "kl": 0.021484375, + "learning_rate": 8.464163822525597e-07, + "loss": 0.2158, + "num_tokens": 201451923.0, + "reward": 0.923828125, + "reward_std": 0.35609889030456543, + "rewards/accuracy_reward/mean": 0.11290322244167328, + "rewards/accuracy_reward/std": 0.3167939782142639, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88623046875, - "rewards/tag_count_reward/std": 0.25048065185546875, + "rewards/tag_count_reward/mean": 0.814453125, + "rewards/tag_count_reward/std": 0.2858298718929291, "step": 249 }, { @@ -7236,27 +7236,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.09375, + "completions/clipped_ratio": 0.19140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 1015.931640625, - "completions/mean_terminated_length": 909.1659545898438, - "completions/min_length": 198.0, - "completions/min_terminated_length": 198.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1034.232421875, + "completions/mean_terminated_length": 794.2584228515625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, "epoch": 0.08534607834769992, - "grad_norm": 0.13097302615642548, - "kl": 0.03509521484375, - "learning_rate": 8.527397260273972e-07, - "loss": 0.12, - "num_tokens": 172945076.0, - "reward": 1.02294921875, - "reward_std": 0.25544142723083496, - "rewards/accuracy_reward/mean": 0.11666666716337204, - "rewards/accuracy_reward/std": 0.3213576078414917, + "grad_norm": 0.16448254883289337, + "kl": 0.023681640625, + "learning_rate": 8.498293515358362e-07, + "loss": 0.2889, + "num_tokens": 202053594.0, + "reward": 0.8896484375, + "reward_std": 0.34110236167907715, + "rewards/accuracy_reward/mean": 0.08749999850988388, + "rewards/accuracy_reward/std": 0.2828611731529236, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.2109440118074417, + "rewards/tag_count_reward/mean": 0.8076171875, + "rewards/tag_count_reward/std": 0.28600868582725525, "step": 250 }, { @@ -7265,27 +7265,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.10546875, + "completions/clipped_ratio": 0.18359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 1058.796875, - "completions/mean_terminated_length": 942.1659545898438, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1060.333984375, + "completions/mean_terminated_length": 838.2272338867188, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.08568746266109073, - "grad_norm": 0.14348679780960083, - "kl": 0.032196044921875, - "learning_rate": 8.561643835616438e-07, - "loss": 0.1364, - "num_tokens": 173562732.0, - "reward": 1.021484375, - "reward_std": 0.2810960114002228, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310423493385315, + "grad_norm": 0.1441984474658966, + "kl": 0.021728515625, + "learning_rate": 8.532423208191127e-07, + "loss": 0.2662, + "num_tokens": 202672037.0, + "reward": 0.91162109375, + "reward_std": 0.3293203115463257, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.896484375, - "rewards/tag_count_reward/std": 0.22885605692863464, + "rewards/tag_count_reward/mean": 0.80224609375, + "rewards/tag_count_reward/std": 0.2960599660873413, "step": 251 }, { @@ -7294,27 +7294,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.138671875, + "completions/clipped_ratio": 0.201171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1133.736328125, - "completions/mean_terminated_length": 986.5419921875, - "completions/min_length": 201.0, - "completions/min_terminated_length": 201.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1088.607421875, + "completions/mean_terminated_length": 847.0000610351562, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, "epoch": 0.08602884697448153, - "grad_norm": 16.952478408813477, - "kl": 0.17535400390625, - "learning_rate": 8.595890410958903e-07, - "loss": 0.1573, - "num_tokens": 174229877.0, - "reward": 0.9892578125, - "reward_std": 0.27832746505737305, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, + "grad_norm": 0.15352463722229004, + "kl": 0.02069091796875, + "learning_rate": 8.56655290102389e-07, + "loss": 0.2408, + "num_tokens": 203316076.0, + "reward": 0.865234375, + "reward_std": 0.323127806186676, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8935546875, - "rewards/tag_count_reward/std": 0.22096005082130432, + "rewards/tag_count_reward/mean": 0.794921875, + "rewards/tag_count_reward/std": 0.2929096817970276, "step": 252 }, { @@ -7323,27 +7323,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.146484375, + "completions/clipped_ratio": 0.19921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 1072.92578125, - "completions/mean_terminated_length": 905.5789184570312, - "completions/min_length": 124.0, - "completions/min_terminated_length": 124.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1110.3359375, + "completions/mean_terminated_length": 877.0634155273438, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, "epoch": 0.08637023128787233, - "grad_norm": 0.1386740356683731, - "kl": 0.03204345703125, - "learning_rate": 8.63013698630137e-07, - "loss": 0.2156, - "num_tokens": 174868047.0, - "reward": 0.95361328125, - "reward_std": 0.2634553909301758, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, + "grad_norm": 0.36992698907852173, + "kl": 0.02252197265625, + "learning_rate": 8.600682593856655e-07, + "loss": 0.2411, + "num_tokens": 203973400.0, + "reward": 0.8388671875, + "reward_std": 0.3086473047733307, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87939453125, - "rewards/tag_count_reward/std": 0.24225826561450958, + "rewards/tag_count_reward/mean": 0.7841796875, + "rewards/tag_count_reward/std": 0.2993040084838867, "step": 253 }, { @@ -7352,27 +7352,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1328125, + "completions/clipped_ratio": 0.19921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 1068.017578125, - "completions/mean_terminated_length": 917.93017578125, - "completions/min_length": 123.0, - "completions/min_terminated_length": 123.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1065.009765625, + "completions/mean_terminated_length": 820.4609985351562, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, "epoch": 0.08671161560126313, - "grad_norm": 6.681758880615234, - "kl": 0.114501953125, - "learning_rate": 8.664383561643836e-07, - "loss": 0.153, - "num_tokens": 175483912.0, - "reward": 1.0986328125, - "reward_std": 0.34474465250968933, - "rewards/accuracy_reward/mean": 0.21484375, - "rewards/accuracy_reward/std": 0.4111155867576599, + "grad_norm": 0.1553116887807846, + "kl": 0.0225830078125, + "learning_rate": 8.634812286689419e-07, + "loss": 0.2449, + "num_tokens": 204587725.0, + "reward": 0.95556640625, + "reward_std": 0.40006691217422485, + "rewards/accuracy_reward/mean": 0.166015625, + "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8837890625, - "rewards/tag_count_reward/std": 0.24910975992679596, + "rewards/tag_count_reward/mean": 0.78955078125, + "rewards/tag_count_reward/std": 0.2980254590511322, "step": 254 }, { @@ -7381,27 +7381,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.12890625, + "completions/clipped_ratio": 0.25390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 1046.703125, - "completions/mean_terminated_length": 898.5291748046875, - "completions/min_length": 74.0, - "completions/min_terminated_length": 74.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1125.8359375, + "completions/mean_terminated_length": 812.010498046875, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, "epoch": 0.08705299991465393, - "grad_norm": 0.1366773396730423, - "kl": 0.032012939453125, - "learning_rate": 8.698630136986301e-07, - "loss": 0.1301, - "num_tokens": 176103568.0, - "reward": 0.99609375, - "reward_std": 0.3028711676597595, - "rewards/accuracy_reward/mean": 0.11693548411130905, - "rewards/accuracy_reward/std": 0.3216678202152252, + "grad_norm": 0.13251453638076782, + "kl": 0.02386474609375, + "learning_rate": 8.668941979522184e-07, + "loss": 0.2572, + "num_tokens": 205247897.0, + "reward": 0.84619140625, + "reward_std": 0.34312546253204346, + "rewards/accuracy_reward/mean": 0.0927419364452362, + "rewards/accuracy_reward/std": 0.2903633117675781, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8828125, - "rewards/tag_count_reward/std": 0.24115973711013794, + "rewards/tag_count_reward/mean": 0.75634765625, + "rewards/tag_count_reward/std": 0.3144960403442383, "step": 255 }, { @@ -7410,27 +7410,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.08984375, + "completions/clipped_ratio": 0.185546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1050.291015625, - "completions/mean_terminated_length": 951.8047485351562, - "completions/min_length": 169.0, - "completions/min_terminated_length": 169.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1082.22265625, + "completions/mean_terminated_length": 862.2014770507812, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, "epoch": 0.08739438422804473, - "grad_norm": 0.1439121514558792, - "kl": 0.034271240234375, - "learning_rate": 8.732876712328767e-07, - "loss": 0.1452, - "num_tokens": 176720821.0, - "reward": 1.0263671875, - "reward_std": 0.2726992666721344, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, + "grad_norm": 0.20386654138565063, + "kl": 0.0267333984375, + "learning_rate": 8.703071672354948e-07, + "loss": 0.2064, + "num_tokens": 205881499.0, + "reward": 0.919921875, + "reward_std": 0.3333033323287964, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9033203125, - "rewards/tag_count_reward/std": 0.21824489533901215, + "rewards/tag_count_reward/mean": 0.810546875, + "rewards/tag_count_reward/std": 0.287961483001709, "step": 256 }, { @@ -7439,27 +7439,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.12890625, + "completions/clipped_ratio": 0.19921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1063.634765625, - "completions/mean_terminated_length": 917.9664306640625, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1059.3828125, + "completions/mean_terminated_length": 813.4341430664062, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.08773576854143553, - "grad_norm": 0.15800635516643524, - "kl": 0.033660888671875, - "learning_rate": 8.767123287671232e-07, - "loss": 0.1738, - "num_tokens": 177340602.0, - "reward": 1.02734375, - "reward_std": 0.3319050967693329, - "rewards/accuracy_reward/mean": 0.140625, - "rewards/accuracy_reward/std": 0.3479743003845215, + "grad_norm": 0.14897507429122925, + "kl": 0.0238037109375, + "learning_rate": 8.737201365187713e-07, + "loss": 0.238, + "num_tokens": 206499103.0, + "reward": 0.91796875, + "reward_std": 0.3448280394077301, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88671875, - "rewards/tag_count_reward/std": 0.24849718809127808, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.2912411689758301, "step": 257 }, { @@ -7468,27 +7468,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.13671875, + "completions/clipped_ratio": 0.19140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1123.80078125, - "completions/mean_terminated_length": 977.4344482421875, - "completions/min_length": 184.0, - "completions/min_terminated_length": 184.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1136.005859375, + "completions/mean_terminated_length": 920.1231689453125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, "epoch": 0.08807715285482633, - "grad_norm": 0.12673071026802063, - "kl": 0.030181884765625, - "learning_rate": 8.801369863013698e-07, - "loss": 0.1611, - "num_tokens": 177984964.0, - "reward": 0.96875, - "reward_std": 0.24542951583862305, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, + "grad_norm": 0.1268673688173294, + "kl": 0.021209716796875, + "learning_rate": 8.771331058020477e-07, + "loss": 0.2073, + "num_tokens": 207149714.0, + "reward": 0.9267578125, + "reward_std": 0.30336177349090576, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.890625, - "rewards/tag_count_reward/std": 0.2392502874135971, + "rewards/tag_count_reward/mean": 0.8310546875, + "rewards/tag_count_reward/std": 0.2771861255168915, "step": 258 }, { @@ -7497,27 +7497,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.134765625, + "completions/clipped_ratio": 0.1796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 1113.703125, - "completions/mean_terminated_length": 968.1806030273438, - "completions/min_length": 102.0, - "completions/min_terminated_length": 102.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1042.88671875, + "completions/mean_terminated_length": 822.7190551757812, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.08841853716821713, - "grad_norm": 132.47552490234375, - "kl": 1.1121826171875, - "learning_rate": 8.835616438356164e-07, - "loss": 0.2124, - "num_tokens": 178629612.0, - "reward": 1.0703125, - "reward_std": 0.3268173933029175, - "rewards/accuracy_reward/mean": 0.1796875, - "rewards/accuracy_reward/std": 0.38430243730545044, + "grad_norm": 0.15476509928703308, + "kl": 0.027069091796875, + "learning_rate": 8.805460750853242e-07, + "loss": 0.213, + "num_tokens": 207758104.0, + "reward": 0.96630859375, + "reward_std": 0.359427809715271, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.890625, - "rewards/tag_count_reward/std": 0.2428024560213089, + "rewards/tag_count_reward/mean": 0.81396484375, + "rewards/tag_count_reward/std": 0.2865806519985199, "step": 259 }, { @@ -7526,27 +7526,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1875, + "completions/clipped_ratio": 0.216796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 1119.72265625, - "completions/mean_terminated_length": 905.5048217773438, - "completions/min_length": 187.0, - "completions/min_terminated_length": 187.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1065.265625, + "completions/mean_terminated_length": 793.2369384765625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, "epoch": 0.08875992148160793, - "grad_norm": 0.15137813985347748, - "kl": 0.03277587890625, - "learning_rate": 8.86986301369863e-07, - "loss": 0.2252, - "num_tokens": 179282174.0, - "reward": 0.900390625, - "reward_std": 0.2901911735534668, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, + "grad_norm": 0.1515800952911377, + "kl": 0.026947021484375, + "learning_rate": 8.839590443686007e-07, + "loss": 0.2679, + "num_tokens": 208382784.0, + "reward": 0.82568359375, + "reward_std": 0.2970122694969177, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.853515625, - "rewards/tag_count_reward/std": 0.27631086111068726, + "rewards/tag_count_reward/mean": 0.78662109375, + "rewards/tag_count_reward/std": 0.298400342464447, "step": 260 }, { @@ -7555,27 +7555,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.134765625, + "completions/clipped_ratio": 0.166015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1087.89453125, - "completions/mean_terminated_length": 938.3521728515625, - "completions/min_length": 102.0, - "completions/min_terminated_length": 102.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 986.626953125, + "completions/mean_terminated_length": 775.3465576171875, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.08910130579499873, - "grad_norm": 0.15067681670188904, - "kl": 0.0322265625, - "learning_rate": 8.904109589041095e-07, - "loss": 0.1344, - "num_tokens": 179913368.0, - "reward": 1.00244140625, - "reward_std": 0.2927597761154175, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, + "grad_norm": 0.1756720244884491, + "kl": 0.0289306640625, + "learning_rate": 8.873720136518771e-07, + "loss": 0.202, + "num_tokens": 208962129.0, + "reward": 0.9228515625, + "reward_std": 0.29644083976745605, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88916015625, - "rewards/tag_count_reward/std": 0.2403615117073059, + "rewards/tag_count_reward/mean": 0.8310546875, + "rewards/tag_count_reward/std": 0.27318599820137024, "step": 261 }, { @@ -7584,27 +7584,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.12109375, + "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 1004.69140625, - "completions/mean_terminated_length": 860.9467163085938, - "completions/min_length": 222.0, - "completions/min_terminated_length": 222.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 856.3671875, + "completions/mean_terminated_length": 712.9540405273438, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.08944269010838952, - "grad_norm": 0.15569855272769928, - "kl": 0.036041259765625, - "learning_rate": 8.938356164383561e-07, - "loss": 0.1649, - "num_tokens": 180502698.0, - "reward": 1.03076171875, - "reward_std": 0.3312453031539917, - "rewards/accuracy_reward/mean": 0.146484375, - "rewards/accuracy_reward/std": 0.35393697023391724, + "grad_norm": 0.16520744562149048, + "kl": 0.0257568359375, + "learning_rate": 8.907849829351535e-07, + "loss": 0.1794, + "num_tokens": 209475517.0, + "reward": 1.00439453125, + "reward_std": 0.3185638189315796, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88427734375, - "rewards/tag_count_reward/std": 0.24312838912010193, + "rewards/tag_count_reward/mean": 0.87744140625, + "rewards/tag_count_reward/std": 0.24076665937900543, "step": 262 }, { @@ -7613,27 +7613,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.09375, + "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 975.12109375, - "completions/mean_terminated_length": 864.1336059570312, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 882.658203125, + "completions/mean_terminated_length": 727.966796875, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, "epoch": 0.08978407442178032, - "grad_norm": 0.1347164362668991, - "kl": 0.03118896484375, - "learning_rate": 8.972602739726027e-07, - "loss": 0.1334, - "num_tokens": 181075736.0, - "reward": 1.05859375, - "reward_std": 0.30694493651390076, - "rewards/accuracy_reward/mean": 0.134765625, - "rewards/accuracy_reward/std": 0.3418070077896118, + "grad_norm": 0.15659339725971222, + "kl": 0.02850341796875, + "learning_rate": 8.941979522184299e-07, + "loss": 0.1734, + "num_tokens": 210001214.0, + "reward": 1.00341796875, + "reward_std": 0.3333035409450531, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.2016531378030777, + "rewards/tag_count_reward/mean": 0.86669921875, + "rewards/tag_count_reward/std": 0.24961701035499573, "step": 263 }, { @@ -7642,27 +7642,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.146484375, + "completions/clipped_ratio": 0.17578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1099.060546875, - "completions/mean_terminated_length": 936.1990356445312, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1048.224609375, + "completions/mean_terminated_length": 835.0023803710938, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.09012545873517112, - "grad_norm": 0.14101961255073547, - "kl": 0.032318115234375, - "learning_rate": 9.006849315068494e-07, - "loss": 0.1613, - "num_tokens": 181722023.0, - "reward": 0.96044921875, - "reward_std": 0.2774454951286316, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, + "grad_norm": 0.14353342354297638, + "kl": 0.027191162109375, + "learning_rate": 8.976109215017065e-07, + "loss": 0.1947, + "num_tokens": 210621473.0, + "reward": 0.9267578125, + "reward_std": 0.31127870082855225, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88037109375, - "rewards/tag_count_reward/std": 0.2417331039905548, + "rewards/tag_count_reward/mean": 0.8408203125, + "rewards/tag_count_reward/std": 0.2650623321533203, "step": 264 }, { @@ -7671,27 +7671,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.158203125, + "completions/clipped_ratio": 0.212890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 1101.751953125, - "completions/mean_terminated_length": 923.9187622070312, - "completions/min_length": 237.0, - "completions/min_terminated_length": 237.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1080.56640625, + "completions/mean_terminated_length": 818.9031982421875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, "epoch": 0.09046684304856192, - "grad_norm": 0.14234143495559692, - "kl": 0.03485107421875, - "learning_rate": 9.041095890410958e-07, - "loss": 0.1435, - "num_tokens": 182367832.0, - "reward": 1.01416015625, - "reward_std": 0.2969135642051697, - "rewards/accuracy_reward/mean": 0.146484375, - "rewards/accuracy_reward/std": 0.35393697023391724, + "grad_norm": 0.15938018262386322, + "kl": 0.026763916015625, + "learning_rate": 9.010238907849829e-07, + "loss": 0.2906, + "num_tokens": 211256435.0, + "reward": 0.9267578125, + "reward_std": 0.3406500816345215, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.86767578125, - "rewards/tag_count_reward/std": 0.26207634806632996, + "rewards/tag_count_reward/mean": 0.8056640625, + "rewards/tag_count_reward/std": 0.30097225308418274, "step": 265 }, { @@ -7700,27 +7700,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.083984375, + "completions/clipped_ratio": 0.146484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 1116.673828125, - "completions/mean_terminated_length": 1031.2857666015625, - "completions/min_length": 275.0, - "completions/min_terminated_length": 275.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1127.380859375, + "completions/mean_terminated_length": 969.3798217773438, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, "epoch": 0.09080822736195272, - "grad_norm": 0.12336399406194687, - "kl": 0.026763916015625, - "learning_rate": 9.075342465753424e-07, - "loss": 0.1345, - "num_tokens": 183014945.0, - "reward": 1.05224609375, - "reward_std": 0.2811427116394043, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, + "grad_norm": 0.11941824108362198, + "kl": 0.023712158203125, + "learning_rate": 9.044368600682594e-07, + "loss": 0.1874, + "num_tokens": 211909030.0, + "reward": 0.93896484375, + "reward_std": 0.2952817678451538, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.19962850213050842, + "rewards/tag_count_reward/mean": 0.83349609375, + "rewards/tag_count_reward/std": 0.27222371101379395, "step": 266 }, { @@ -7729,27 +7729,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.130859375, + "completions/clipped_ratio": 0.166015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 1091.169921875, - "completions/mean_terminated_length": 947.1078491210938, - "completions/min_length": 168.0, - "completions/min_terminated_length": 168.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1053.94921875, + "completions/mean_terminated_length": 856.0702514648438, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, "epoch": 0.09114961167534352, - "grad_norm": 0.12977954745292664, - "kl": 0.028839111328125, - "learning_rate": 9.10958904109589e-07, - "loss": 0.1588, - "num_tokens": 183651832.0, - "reward": 1.083984375, - "reward_std": 0.32412779331207275, - "rewards/accuracy_reward/mean": 0.19140625, - "rewards/accuracy_reward/std": 0.3937928080558777, + "grad_norm": 0.12648114562034607, + "kl": 0.025543212890625, + "learning_rate": 9.078498293515358e-07, + "loss": 0.1808, + "num_tokens": 212526860.0, + "reward": 1.001953125, + "reward_std": 0.3504851460456848, + "rewards/accuracy_reward/mean": 0.166015625, + "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.892578125, - "rewards/tag_count_reward/std": 0.23602545261383057, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.27481982111930847, "step": 267 }, { @@ -7758,27 +7758,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.146484375, + "completions/clipped_ratio": 0.185546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 1074.22265625, - "completions/mean_terminated_length": 907.0983276367188, - "completions/min_length": 81.0, - "completions/min_terminated_length": 81.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1063.12890625, + "completions/mean_terminated_length": 838.7578125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, "epoch": 0.09149099598873432, - "grad_norm": 0.41138696670532227, - "kl": 0.034881591796875, - "learning_rate": 9.143835616438355e-07, - "loss": 0.1652, - "num_tokens": 184278586.0, - "reward": 1.01171875, - "reward_std": 0.2644849717617035, + "grad_norm": 0.14063085615634918, + "kl": 0.02874755859375, + "learning_rate": 9.112627986348122e-07, + "loss": 0.2123, + "num_tokens": 213147934.0, + "reward": 0.95947265625, + "reward_std": 0.31506675481796265, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.880859375, - "rewards/tag_count_reward/std": 0.24423858523368835, + "rewards/tag_count_reward/mean": 0.82861328125, + "rewards/tag_count_reward/std": 0.2750140130519867, "step": 268 }, { @@ -7787,27 +7787,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.146484375, + "completions/clipped_ratio": 0.2109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 1102.029296875, - "completions/mean_terminated_length": 939.6773071289062, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1097.771484375, + "completions/mean_terminated_length": 843.75, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, "epoch": 0.09183238030212512, - "grad_norm": 0.13787169754505157, - "kl": 0.03399658203125, - "learning_rate": 9.178082191780822e-07, - "loss": 0.1477, - "num_tokens": 184915593.0, - "reward": 1.0498046875, - "reward_std": 0.36720407009124756, - "rewards/accuracy_reward/mean": 0.177734375, - "rewards/accuracy_reward/std": 0.3826628625392914, + "grad_norm": 0.13077448308467865, + "kl": 0.0252685546875, + "learning_rate": 9.146757679180886e-07, + "loss": 0.2463, + "num_tokens": 213782761.0, + "reward": 0.94384765625, + "reward_std": 0.4028789699077606, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8720703125, - "rewards/tag_count_reward/std": 0.25887611508369446, + "rewards/tag_count_reward/mean": 0.79736328125, + "rewards/tag_count_reward/std": 0.30380475521087646, "step": 269 }, { @@ -7816,27 +7816,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1015625, + "completions/clipped_ratio": 0.158203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 1055.0078125, - "completions/mean_terminated_length": 942.7564697265625, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 999.697265625, + "completions/mean_terminated_length": 802.6844482421875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, "epoch": 0.09217376461551592, - "grad_norm": 0.14218027889728546, - "kl": 0.03228759765625, - "learning_rate": 9.212328767123288e-07, - "loss": 0.1332, - "num_tokens": 185527117.0, - "reward": 1.08154296875, - "reward_std": 0.2797403633594513, - "rewards/accuracy_reward/mean": 0.171875, - "rewards/accuracy_reward/std": 0.3776407241821289, + "grad_norm": 0.15468211472034454, + "kl": 0.0260009765625, + "learning_rate": 9.180887372013651e-07, + "loss": 0.2588, + "num_tokens": 214365966.0, + "reward": 1.02880859375, + "reward_std": 0.3489936590194702, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38430243730545044, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.21676163375377655, + "rewards/tag_count_reward/mean": 0.84912109375, + "rewards/tag_count_reward/std": 0.26874756813049316, "step": 270 }, { @@ -7845,27 +7845,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.087890625, + "completions/clipped_ratio": 0.166015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 969.294921875, - "completions/mean_terminated_length": 865.3511962890625, - "completions/min_length": 70.0, - "completions/min_terminated_length": 70.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1009.068359375, + "completions/mean_terminated_length": 802.2552490234375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, "epoch": 0.09251514892890672, - "grad_norm": 0.15893812477588654, - "kl": 0.031646728515625, - "learning_rate": 9.246575342465753e-07, - "loss": 0.1461, - "num_tokens": 186097316.0, - "reward": 1.1025390625, - "reward_std": 0.27628418803215027, - "rewards/accuracy_reward/mean": 0.1875, - "rewards/accuracy_reward/std": 0.39069411158561707, + "grad_norm": 0.15266716480255127, + "kl": 0.0250244140625, + "learning_rate": 9.215017064846417e-07, + "loss": 0.2024, + "num_tokens": 214956529.0, + "reward": 0.95654296875, + "reward_std": 0.34085455536842346, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.20715762674808502, + "rewards/tag_count_reward/mean": 0.82568359375, + "rewards/tag_count_reward/std": 0.28284189105033875, "step": 271 }, { @@ -7874,27 +7874,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.095703125, + "completions/clipped_ratio": 0.162109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1927.0, - "completions/mean_length": 973.080078125, - "completions/mean_terminated_length": 859.3196411132812, - "completions/min_length": 88.0, - "completions/min_terminated_length": 88.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 950.8125, + "completions/mean_terminated_length": 738.5361328125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, "epoch": 0.09285653324229752, - "grad_norm": 0.17674598097801208, - "kl": 0.033203125, - "learning_rate": 9.280821917808219e-07, - "loss": 0.1953, - "num_tokens": 186669629.0, - "reward": 1.05517578125, - "reward_std": 0.2933085262775421, - "rewards/accuracy_reward/mean": 0.146484375, - "rewards/accuracy_reward/std": 0.35393697023391724, + "grad_norm": 0.2611883878707886, + "kl": 0.032440185546875, + "learning_rate": 9.249146757679181e-07, + "loss": 0.2428, + "num_tokens": 215517441.0, + "reward": 0.962890625, + "reward_std": 0.2771681547164917, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90869140625, - "rewards/tag_count_reward/std": 0.22027301788330078, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.2707847058773041, "step": 272 }, { @@ -7903,27 +7903,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.14453125, + "completions/clipped_ratio": 0.185546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1073.92578125, - "completions/mean_terminated_length": 909.3561401367188, - "completions/min_length": 199.0, - "completions/min_terminated_length": 199.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1052.43359375, + "completions/mean_terminated_length": 825.6259155273438, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.09319791755568832, - "grad_norm": 0.6983135342597961, - "kl": 0.04931640625, - "learning_rate": 9.315068493150684e-07, - "loss": 0.1051, - "num_tokens": 187297543.0, - "reward": 1.0546875, - "reward_std": 0.3062615394592285, - "rewards/accuracy_reward/mean": 0.169921875, - "rewards/accuracy_reward/std": 0.3759314715862274, + "grad_norm": 0.14994241297245026, + "kl": 0.0274658203125, + "learning_rate": 9.283276450511945e-07, + "loss": 0.1972, + "num_tokens": 216134351.0, + "reward": 0.974609375, + "reward_std": 0.3396124839782715, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.884765625, - "rewards/tag_count_reward/std": 0.24660581350326538, + "rewards/tag_count_reward/mean": 0.802734375, + "rewards/tag_count_reward/std": 0.29327481985092163, "step": 273 }, { @@ -7932,27 +7932,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.107421875, + "completions/clipped_ratio": 0.16796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1065.720703125, - "completions/mean_terminated_length": 947.5032958984375, - "completions/min_length": 72.0, - "completions/min_terminated_length": 72.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1051.017578125, + "completions/mean_terminated_length": 849.7488403320312, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, "epoch": 0.09353930186907912, - "grad_norm": 0.14864228665828705, - "kl": 0.032867431640625, - "learning_rate": 9.349315068493149e-07, - "loss": 0.1027, - "num_tokens": 187914568.0, - "reward": 0.96630859375, - "reward_std": 0.25238245725631714, - "rewards/accuracy_reward/mean": 0.06451612710952759, - "rewards/accuracy_reward/std": 0.2459181249141693, + "grad_norm": 0.14378777146339417, + "kl": 0.027984619140625, + "learning_rate": 9.317406143344709e-07, + "loss": 0.1863, + "num_tokens": 216743848.0, + "reward": 0.8837890625, + "reward_std": 0.30530285835266113, + "rewards/accuracy_reward/mean": 0.05443548411130905, + "rewards/accuracy_reward/std": 0.227104052901268, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90380859375, - "rewards/tag_count_reward/std": 0.22207027673721313, + "rewards/tag_count_reward/mean": 0.8310546875, + "rewards/tag_count_reward/std": 0.276744544506073, "step": 274 }, { @@ -7961,27 +7961,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.11328125, + "completions/clipped_ratio": 0.1640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1017.689453125, - "completions/mean_terminated_length": 886.0638427734375, - "completions/min_length": 83.0, - "completions/min_terminated_length": 83.0, + "completions/mean_length": 1040.619140625, + "completions/mean_terminated_length": 842.9088745117188, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, "epoch": 0.09388068618246992, - "grad_norm": 0.1409551501274109, - "kl": 0.031463623046875, - "learning_rate": 9.383561643835616e-07, - "loss": 0.1503, - "num_tokens": 188518681.0, - "reward": 1.00537109375, - "reward_std": 0.2579900920391083, - "rewards/accuracy_reward/mean": 0.11088709533214569, - "rewards/accuracy_reward/std": 0.3143092691898346, + "grad_norm": 0.3193594813346863, + "kl": 0.030029296875, + "learning_rate": 9.351535836177474e-07, + "loss": 0.1679, + "num_tokens": 217359701.0, + "reward": 0.89013671875, + "reward_std": 0.3236680328845978, + "rewards/accuracy_reward/mean": 0.0786290317773819, + "rewards/accuracy_reward/std": 0.26943066716194153, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89794921875, - "rewards/tag_count_reward/std": 0.2318999469280243, + "rewards/tag_count_reward/mean": 0.81396484375, + "rewards/tag_count_reward/std": 0.2895527482032776, "step": 275 }, { @@ -7990,27 +7990,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.091796875, + "completions/clipped_ratio": 0.158203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1094.66015625, - "completions/mean_terminated_length": 998.3010864257812, - "completions/min_length": 218.0, - "completions/min_terminated_length": 218.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1103.12890625, + "completions/mean_terminated_length": 925.5545043945312, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, "epoch": 0.09422207049586072, - "grad_norm": 0.13870646059513092, - "kl": 0.03155517578125, - "learning_rate": 9.417808219178082e-07, - "loss": 0.114, - "num_tokens": 189151483.0, - "reward": 1.107421875, - "reward_std": 0.3174337148666382, - "rewards/accuracy_reward/mean": 0.185546875, - "rewards/accuracy_reward/std": 0.38912075757980347, + "grad_norm": 0.13808484375476837, + "kl": 0.027313232421875, + "learning_rate": 9.385665529010238e-07, + "loss": 0.1576, + "num_tokens": 217996839.0, + "reward": 0.9833984375, + "reward_std": 0.3657929301261902, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.20272120833396912, + "rewards/tag_count_reward/mean": 0.8232421875, + "rewards/tag_count_reward/std": 0.27803975343704224, "step": 276 }, { @@ -8021,25 +8021,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.185546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1997.0, - "completions/mean_length": 1112.8125, - "completions/mean_terminated_length": 899.7601928710938, - "completions/min_length": 167.0, - "completions/min_terminated_length": 167.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1013.66015625, + "completions/mean_terminated_length": 778.0192260742188, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, "epoch": 0.09456345480925152, - "grad_norm": 0.1267813742160797, - "kl": 0.033416748046875, - "learning_rate": 9.452054794520548e-07, - "loss": 0.1516, - "num_tokens": 189795563.0, - "reward": 0.9833984375, - "reward_std": 0.3141370415687561, - "rewards/accuracy_reward/mean": 0.13709677755832672, - "rewards/accuracy_reward/std": 0.34429675340652466, + "grad_norm": 0.13520270586013794, + "kl": 0.0279541015625, + "learning_rate": 9.419795221843004e-07, + "loss": 0.218, + "num_tokens": 218590153.0, + "reward": 0.90869140625, + "reward_std": 0.3093388080596924, + "rewards/accuracy_reward/mean": 0.09072580933570862, + "rewards/accuracy_reward/std": 0.2875087857246399, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8505859375, - "rewards/tag_count_reward/std": 0.27871251106262207, + "rewards/tag_count_reward/mean": 0.82080078125, + "rewards/tag_count_reward/std": 0.29300713539123535, "step": 277 }, { @@ -8048,27 +8048,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.189453125, + "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1171.390625, - "completions/mean_terminated_length": 966.4963989257812, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1101.97265625, + "completions/mean_terminated_length": 860.8284912109375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, "epoch": 0.09490483912264232, - "grad_norm": 0.1538221538066864, - "kl": 0.034454345703125, - "learning_rate": 9.486301369863013e-07, - "loss": 0.19, - "num_tokens": 190471267.0, - "reward": 1.01171875, - "reward_std": 0.35299718379974365, - "rewards/accuracy_reward/mean": 0.16015625, - "rewards/accuracy_reward/std": 0.3671095669269562, + "grad_norm": 0.17787189781665802, + "kl": 0.035247802734375, + "learning_rate": 9.453924914675768e-07, + "loss": 0.2535, + "num_tokens": 219230315.0, + "reward": 0.93603515625, + "reward_std": 0.37049442529678345, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8515625, - "rewards/tag_count_reward/std": 0.27481982111930847, + "rewards/tag_count_reward/mean": 0.79931640625, + "rewards/tag_count_reward/std": 0.2977977395057678, "step": 278 }, { @@ -8077,27 +8077,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.158203125, + "completions/clipped_ratio": 0.20703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 1120.703125, - "completions/mean_terminated_length": 946.4315185546875, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1113.8203125, + "completions/mean_terminated_length": 869.921142578125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, "epoch": 0.09524622343603312, - "grad_norm": 0.13736364245414734, - "kl": 0.03326416015625, - "learning_rate": 9.520547945205479e-07, - "loss": 0.1663, - "num_tokens": 191132507.0, - "reward": 1.0107421875, - "reward_std": 0.29221630096435547, - "rewards/accuracy_reward/mean": 0.14453125, - "rewards/accuracy_reward/std": 0.35197147727012634, + "grad_norm": 0.1434744894504547, + "kl": 0.02899169921875, + "learning_rate": 9.488054607508532e-07, + "loss": 0.2241, + "num_tokens": 219888031.0, + "reward": 0.93896484375, + "reward_std": 0.341295063495636, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8662109375, - "rewards/tag_count_reward/std": 0.26342782378196716, + "rewards/tag_count_reward/mean": 0.80810546875, + "rewards/tag_count_reward/std": 0.2949638068675995, "step": 279 }, { @@ -8106,27 +8106,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1484375, + "completions/clipped_ratio": 0.13671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 1139.353515625, - "completions/mean_terminated_length": 980.965576171875, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1070.541015625, + "completions/mean_terminated_length": 915.7398681640625, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, "epoch": 0.09558760774942392, - "grad_norm": 0.14240288734436035, - "kl": 0.031890869140625, - "learning_rate": 9.554794520547946e-07, - "loss": 0.1552, - "num_tokens": 191784656.0, - "reward": 0.9638671875, - "reward_std": 0.33411771059036255, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, + "grad_norm": 0.13831409811973572, + "kl": 0.0296630859375, + "learning_rate": 9.522184300341296e-07, + "loss": 0.1834, + "num_tokens": 220504948.0, + "reward": 0.90380859375, + "reward_std": 0.31455230712890625, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8662109375, - "rewards/tag_count_reward/std": 0.26249760389328003, + "rewards/tag_count_reward/mean": 0.83154296875, + "rewards/tag_count_reward/std": 0.2746037244796753, "step": 280 }, { @@ -8135,27 +8135,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.154296875, + "completions/clipped_ratio": 0.23046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 1133.05078125, - "completions/mean_terminated_length": 966.1200561523438, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1126.234375, + "completions/mean_terminated_length": 850.1725463867188, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.09592899206281472, - "grad_norm": 0.14882326126098633, - "kl": 0.032806396484375, - "learning_rate": 9.58904109589041e-07, - "loss": 0.1661, - "num_tokens": 192439498.0, - "reward": 1.00732421875, - "reward_std": 0.3253936469554901, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310423493385315, + "grad_norm": 0.13532893359661102, + "kl": 0.02655029296875, + "learning_rate": 9.556313993174062e-07, + "loss": 0.2558, + "num_tokens": 221156300.0, + "reward": 0.8955078125, + "reward_std": 0.38487786054611206, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88232421875, - "rewards/tag_count_reward/std": 0.24469931423664093, + "rewards/tag_count_reward/mean": 0.7783203125, + "rewards/tag_count_reward/std": 0.3099439740180969, "step": 281 }, { @@ -8164,27 +8164,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.130859375, + "completions/clipped_ratio": 0.158203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 1124.294921875, - "completions/mean_terminated_length": 985.22021484375, - "completions/min_length": 229.0, - "completions/min_terminated_length": 229.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1045.921875, + "completions/mean_terminated_length": 857.5962524414062, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, "epoch": 0.09627037637620552, - "grad_norm": 0.1329856961965561, - "kl": 0.02911376953125, - "learning_rate": 9.623287671232875e-07, - "loss": 0.1325, - "num_tokens": 193085665.0, - "reward": 1.037109375, - "reward_std": 0.29661017656326294, - "rewards/accuracy_reward/mean": 0.150390625, - "rewards/accuracy_reward/std": 0.35780346393585205, + "grad_norm": 0.13227953016757965, + "kl": 0.02691650390625, + "learning_rate": 9.590443686006826e-07, + "loss": 0.1855, + "num_tokens": 221762340.0, + "reward": 0.962890625, + "reward_std": 0.328264057636261, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88671875, - "rewards/tag_count_reward/std": 0.2389625608921051, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.2695113718509674, "step": 282 }, { @@ -8193,27 +8193,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1328125, + "completions/clipped_ratio": 0.21484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1120.37109375, - "completions/mean_terminated_length": 978.3018188476562, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1145.62109375, + "completions/mean_terminated_length": 898.7014770507812, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, "epoch": 0.09661176068959632, - "grad_norm": 0.1428883969783783, - "kl": 0.0323486328125, - "learning_rate": 9.657534246575343e-07, - "loss": 0.1881, - "num_tokens": 193747583.0, - "reward": 0.93994140625, - "reward_std": 0.2519282102584839, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, + "grad_norm": 0.14567619562149048, + "kl": 0.029205322265625, + "learning_rate": 9.62457337883959e-07, + "loss": 0.268, + "num_tokens": 222437186.0, + "reward": 0.841796875, + "reward_std": 0.3250593841075897, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89111328125, - "rewards/tag_count_reward/std": 0.2324761003255844, + "rewards/tag_count_reward/mean": 0.794921875, + "rewards/tag_count_reward/std": 0.3043770492076874, "step": 283 }, { @@ -8222,27 +8222,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1484375, + "completions/clipped_ratio": 0.1640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1973.0, - "completions/mean_length": 1102.798828125, - "completions/mean_terminated_length": 938.0389404296875, - "completions/min_length": 199.0, - "completions/min_terminated_length": 199.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1029.9453125, + "completions/mean_terminated_length": 830.14013671875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, "epoch": 0.09695314500298712, - "grad_norm": 0.13743260502815247, - "kl": 0.030731201171875, - "learning_rate": 9.691780821917808e-07, - "loss": 0.1875, - "num_tokens": 194380648.0, - "reward": 1.044921875, - "reward_std": 0.3495681881904602, - "rewards/accuracy_reward/mean": 0.16796875, - "rewards/accuracy_reward/std": 0.374204158782959, + "grad_norm": 0.1334325075149536, + "kl": 0.025909423828125, + "learning_rate": 9.658703071672355e-07, + "loss": 0.2196, + "num_tokens": 223032950.0, + "reward": 0.9970703125, + "reward_std": 0.3637813329696655, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.876953125, - "rewards/tag_count_reward/std": 0.25459781289100647, + "rewards/tag_count_reward/mean": 0.8369140625, + "rewards/tag_count_reward/std": 0.2806803584098816, "step": 284 }, { @@ -8251,27 +8251,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1015625, + "completions/clipped_ratio": 0.2109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1021.318359375, - "completions/mean_terminated_length": 905.2586669921875, - "completions/min_length": 191.0, - "completions/min_terminated_length": 191.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1136.189453125, + "completions/mean_terminated_length": 892.4381103515625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, "epoch": 0.09729452931637791, - "grad_norm": 0.15407976508140564, - "kl": 0.033599853515625, - "learning_rate": 9.726027397260274e-07, - "loss": 0.1479, - "num_tokens": 194986955.0, - "reward": 1.06396484375, - "reward_std": 0.2892252802848816, - "rewards/accuracy_reward/mean": 0.1640625, - "rewards/accuracy_reward/std": 0.37069445848464966, + "grad_norm": 0.14422355592250824, + "kl": 0.02813720703125, + "learning_rate": 9.69283276450512e-07, + "loss": 0.2591, + "num_tokens": 223698071.0, + "reward": 0.95458984375, + "reward_std": 0.3851277530193329, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89990234375, - "rewards/tag_count_reward/std": 0.21471090614795685, + "rewards/tag_count_reward/mean": 0.80615234375, + "rewards/tag_count_reward/std": 0.2949281632900238, "step": 285 }, { @@ -8280,27 +8280,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.185546875, + "completions/clipped_ratio": 0.189453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 1134.236328125, - "completions/mean_terminated_length": 926.0647583007812, - "completions/min_length": 172.0, - "completions/min_terminated_length": 172.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1083.91796875, + "completions/mean_terminated_length": 858.5783081054688, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, "epoch": 0.09763591362976871, - "grad_norm": 0.13862177729606628, - "kl": 0.032135009765625, - "learning_rate": 9.76027397260274e-07, - "loss": 0.1517, - "num_tokens": 195647028.0, - "reward": 0.99853515625, - "reward_std": 0.3387994170188904, - "rewards/accuracy_reward/mean": 0.142578125, - "rewards/accuracy_reward/std": 0.3499840497970581, + "grad_norm": 0.13818009197711945, + "kl": 0.02630615234375, + "learning_rate": 9.726962457337883e-07, + "loss": 0.2129, + "num_tokens": 224332381.0, + "reward": 0.96923828125, + "reward_std": 0.3827122449874878, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.85595703125, - "rewards/tag_count_reward/std": 0.27515992522239685, + "rewards/tag_count_reward/mean": 0.81494140625, + "rewards/tag_count_reward/std": 0.29269713163375854, "step": 286 }, { @@ -8309,27 +8309,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.111328125, + "completions/clipped_ratio": 0.166015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 1058.318359375, - "completions/mean_terminated_length": 934.3363037109375, - "completions/min_length": 193.0, - "completions/min_terminated_length": 193.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1039.78125, + "completions/mean_terminated_length": 839.0819091796875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.09797729794315951, - "grad_norm": 0.14303427934646606, - "kl": 0.02935791015625, - "learning_rate": 9.794520547945205e-07, - "loss": 0.1308, - "num_tokens": 196275799.0, - "reward": 1.021484375, - "reward_std": 0.2921723425388336, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, + "grad_norm": 0.14740139245986938, + "kl": 0.028076171875, + "learning_rate": 9.761092150170647e-07, + "loss": 0.2035, + "num_tokens": 224951661.0, + "reward": 0.935546875, + "reward_std": 0.3139309883117676, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91015625, - "rewards/tag_count_reward/std": 0.21892902255058289, + "rewards/tag_count_reward/mean": 0.849609375, + "rewards/tag_count_reward/std": 0.26742565631866455, "step": 287 }, { @@ -8338,27 +8338,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.11328125, + "completions/clipped_ratio": 0.162109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 1092.0546875, - "completions/mean_terminated_length": 969.9295043945312, - "completions/min_length": 197.0, - "completions/min_terminated_length": 197.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1095.73828125, + "completions/mean_terminated_length": 911.5011596679688, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.09831868225655031, - "grad_norm": 0.13289646804332733, - "kl": 0.031463623046875, - "learning_rate": 9.828767123287671e-07, - "loss": 0.1347, - "num_tokens": 196916243.0, - "reward": 0.990234375, - "reward_std": 0.2881425619125366, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, + "grad_norm": 0.12596474587917328, + "kl": 0.024871826171875, + "learning_rate": 9.795221843003413e-07, + "loss": 0.1797, + "num_tokens": 225593991.0, + "reward": 0.91943359375, + "reward_std": 0.33202680945396423, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.900390625, - "rewards/tag_count_reward/std": 0.22684305906295776, + "rewards/tag_count_reward/mean": 0.83740234375, + "rewards/tag_count_reward/std": 0.2781200706958771, "step": 288 }, { @@ -8367,27 +8367,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.099609375, + "completions/clipped_ratio": 0.189453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 1079.505859375, - "completions/mean_terminated_length": 972.3622436523438, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1111.5546875, + "completions/mean_terminated_length": 892.6747436523438, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, "epoch": 0.09866006656994111, - "grad_norm": 0.15402287244796753, - "kl": 0.032745361328125, - "learning_rate": 9.863013698630137e-07, - "loss": 0.0843, - "num_tokens": 197546662.0, - "reward": 1.0712890625, - "reward_std": 0.2916201651096344, - "rewards/accuracy_reward/mean": 0.15625, - "rewards/accuracy_reward/std": 0.36344730854034424, + "grad_norm": 0.14945776760578156, + "kl": 0.028411865234375, + "learning_rate": 9.829351535836176e-07, + "loss": 0.21, + "num_tokens": 226240819.0, + "reward": 0.9716796875, + "reward_std": 0.3576521873474121, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.2077472060918808, + "rewards/tag_count_reward/mean": 0.8291015625, + "rewards/tag_count_reward/std": 0.2847094237804413, "step": 289 }, { @@ -8396,27 +8396,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.103515625, + "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1942.0, - "completions/mean_length": 1006.572265625, - "completions/mean_terminated_length": 886.3202514648438, - "completions/min_length": 254.0, - "completions/min_terminated_length": 254.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1049.82421875, + "completions/mean_terminated_length": 842.6557006835938, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.09900145088333191, - "grad_norm": 0.1325167715549469, - "kl": 0.03338623046875, - "learning_rate": 9.897260273972602e-07, - "loss": 0.0719, - "num_tokens": 198137787.0, - "reward": 1.00390625, - "reward_std": 0.23973451554775238, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, + "grad_norm": 0.1299966275691986, + "kl": 0.028961181640625, + "learning_rate": 9.863481228668942e-07, + "loss": 0.154, + "num_tokens": 226854089.0, + "reward": 0.94482421875, + "reward_std": 0.28372257947921753, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.20616121590137482, + "rewards/tag_count_reward/mean": 0.84716796875, + "rewards/tag_count_reward/std": 0.27574270963668823, "step": 290 }, { @@ -8425,27 +8425,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.177734375, + "completions/clipped_ratio": 0.216796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 1207.25390625, - "completions/mean_terminated_length": 1025.5250244140625, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1203.43359375, + "completions/mean_terminated_length": 969.65087890625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, "epoch": 0.09934283519672271, - "grad_norm": 325970.25, - "kl": 1808.0221862792969, - "learning_rate": 9.931506849315068e-07, - "loss": 72.6033, - "num_tokens": 198833341.0, - "reward": 0.98974609375, - "reward_std": 0.27682816982269287, - "rewards/accuracy_reward/mean": 0.140625, - "rewards/accuracy_reward/std": 0.3479743003845215, + "grad_norm": 0.13255862891674042, + "kl": 0.024932861328125, + "learning_rate": 9.897610921501706e-07, + "loss": 0.214, + "num_tokens": 227547687.0, + "reward": 0.9423828125, + "reward_std": 0.34205949306488037, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.84912109375, - "rewards/tag_count_reward/std": 0.26692092418670654, + "rewards/tag_count_reward/mean": 0.8134765625, + "rewards/tag_count_reward/std": 0.2907141149044037, "step": 291 }, { @@ -8454,27 +8454,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1328125, + "completions/clipped_ratio": 0.173828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1907.0, - "completions/mean_length": 1068.892578125, - "completions/mean_terminated_length": 918.939208984375, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1085.421875, + "completions/mean_terminated_length": 882.8936157226562, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, "epoch": 0.09968421951011351, - "grad_norm": 0.16380666196346283, - "kl": 0.033355712890625, - "learning_rate": 9.965753424657534e-07, - "loss": 0.2172, - "num_tokens": 199456134.0, - "reward": 1.00732421875, - "reward_std": 0.31261372566223145, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, + "grad_norm": 0.16226981580257416, + "kl": 0.03125, + "learning_rate": 9.93174061433447e-07, + "loss": 0.1645, + "num_tokens": 228178943.0, + "reward": 0.9697265625, + "reward_std": 0.32159310579299927, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88037109375, - "rewards/tag_count_reward/std": 0.25261926651000977, + "rewards/tag_count_reward/mean": 0.8349609375, + "rewards/tag_count_reward/std": 0.2817133665084839, "step": 292 }, { @@ -8483,27 +8483,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.091796875, + "completions/clipped_ratio": 0.15234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1041.5, - "completions/mean_terminated_length": 939.7677612304688, - "completions/min_length": 247.0, - "completions/min_terminated_length": 247.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1029.17578125, + "completions/mean_terminated_length": 846.0691528320312, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.10002560382350431, - "grad_norm": 2.1582608222961426, - "kl": 0.0596923828125, - "learning_rate": 1e-06, - "loss": 0.1275, - "num_tokens": 200065414.0, - "reward": 1.09130859375, - "reward_std": 0.3151357173919678, - "rewards/accuracy_reward/mean": 0.177734375, - "rewards/accuracy_reward/std": 0.3826628625392914, + "grad_norm": 0.5379236936569214, + "kl": 0.03948974609375, + "learning_rate": 9.965870307167234e-07, + "loss": 0.2447, + "num_tokens": 228781913.0, + "reward": 1.025390625, + "reward_std": 0.36331331729888916, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.20978116989135742, + "rewards/tag_count_reward/mean": 0.861328125, + "rewards/tag_count_reward/std": 0.2650713324546814, "step": 293 }, { @@ -8512,27 +8512,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.111328125, + "completions/clipped_ratio": 0.14453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1068.361328125, - "completions/mean_terminated_length": 945.6373901367188, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1009.509765625, + "completions/mean_terminated_length": 834.0570678710938, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.10036698813689511, - "grad_norm": 0.18343022465705872, - "kl": 0.035888671875, - "learning_rate": 9.99999680653653e-07, - "loss": 0.1589, - "num_tokens": 200689711.0, - "reward": 0.9853515625, - "reward_std": 0.2748567759990692, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, + "grad_norm": 0.13812671601772308, + "kl": 0.027008056640625, + "learning_rate": 1e-06, + "loss": 0.1818, + "num_tokens": 229376078.0, + "reward": 0.97021484375, + "reward_std": 0.30007249116897583, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8955078125, - "rewards/tag_count_reward/std": 0.2326553463935852, + "rewards/tag_count_reward/mean": 0.86865234375, + "rewards/tag_count_reward/std": 0.2644248306751251, "step": 294 }, { @@ -8541,27 +8541,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.123046875, + "completions/clipped_ratio": 0.169921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1127.794921875, - "completions/mean_terminated_length": 998.6793212890625, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1069.103515625, + "completions/mean_terminated_length": 868.7175903320312, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, "epoch": 0.10070837245028591, - "grad_norm": 0.12994924187660217, - "kl": 0.03369140625, - "learning_rate": 9.999987226150655e-07, - "loss": 0.13, - "num_tokens": 201340918.0, - "reward": 0.9599609375, - "reward_std": 0.25544899702072144, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, + "grad_norm": 0.14459733664989471, + "kl": 0.030517578125, + "learning_rate": 9.999996804113108e-07, + "loss": 0.2377, + "num_tokens": 229997235.0, + "reward": 0.89013671875, + "reward_std": 0.2950636148452759, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8935546875, - "rewards/tag_count_reward/std": 0.23750056326389313, + "rewards/tag_count_reward/mean": 0.84716796875, + "rewards/tag_count_reward/std": 0.27662840485572815, "step": 295 }, { @@ -8570,27 +8570,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.11328125, + "completions/clipped_ratio": 0.15234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 1050.5390625, - "completions/mean_terminated_length": 923.110107421875, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 991.001953125, + "completions/mean_terminated_length": 801.0345458984375, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, "epoch": 0.10104975676367671, - "grad_norm": 0.1931072175502777, - "kl": 0.032958984375, - "learning_rate": 9.99997125885597e-07, - "loss": 0.1366, - "num_tokens": 201950554.0, - "reward": 1.06298828125, - "reward_std": 0.2893334925174713, - "rewards/accuracy_reward/mean": 0.15625, - "rewards/accuracy_reward/std": 0.36344730854034424, + "grad_norm": 0.14756450057029724, + "kl": 0.02978515625, + "learning_rate": 9.999987216456977e-07, + "loss": 0.2413, + "num_tokens": 230576388.0, + "reward": 0.9853515625, + "reward_std": 0.33799609541893005, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.22386592626571655, + "rewards/tag_count_reward/mean": 0.8544921875, + "rewards/tag_count_reward/std": 0.26830142736434937, "step": 296 }, { @@ -8599,27 +8599,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.14453125, + "completions/clipped_ratio": 0.2109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 1171.7890625, - "completions/mean_terminated_length": 1023.7533569335938, - "completions/min_length": 248.0, - "completions/min_terminated_length": 248.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1136.458984375, + "completions/mean_terminated_length": 892.7796630859375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, "epoch": 0.10139114107706751, - "grad_norm": 0.1361197531223297, - "kl": 0.031890869140625, - "learning_rate": 9.99994890467514e-07, - "loss": 0.1594, - "num_tokens": 202628494.0, - "reward": 1.037109375, - "reward_std": 0.32598963379859924, - "rewards/accuracy_reward/mean": 0.14453125, - "rewards/accuracy_reward/std": 0.35197147727012634, + "grad_norm": 0.14217402040958405, + "kl": 0.0286865234375, + "learning_rate": 9.999971237045224e-07, + "loss": 0.2314, + "num_tokens": 231236239.0, + "reward": 0.9287109375, + "reward_std": 0.36870306730270386, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.892578125, - "rewards/tag_count_reward/std": 0.24216407537460327, + "rewards/tag_count_reward/mean": 0.8095703125, + "rewards/tag_count_reward/std": 0.298178106546402, "step": 297 }, { @@ -8628,27 +8628,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1484375, + "completions/clipped_ratio": 0.166015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 1204.568359375, - "completions/mean_terminated_length": 1057.548095703125, - "completions/min_length": 214.0, - "completions/min_terminated_length": 214.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1177.875, + "completions/mean_terminated_length": 1004.6650390625, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, "epoch": 0.10173252539045831, - "grad_norm": 0.12307877838611603, - "kl": 0.03009033203125, - "learning_rate": 9.999920163639891e-07, - "loss": 0.1402, - "num_tokens": 203318801.0, - "reward": 1.0185546875, - "reward_std": 0.3293812870979309, - "rewards/accuracy_reward/mean": 0.13671875, - "rewards/accuracy_reward/std": 0.3438861668109894, + "grad_norm": 0.1258356273174286, + "kl": 0.0262451171875, + "learning_rate": 9.999948865900542e-07, + "loss": 0.1849, + "num_tokens": 231912879.0, + "reward": 0.95849609375, + "reward_std": 0.36331701278686523, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8818359375, - "rewards/tag_count_reward/std": 0.2422015517950058, + "rewards/tag_count_reward/mean": 0.83349609375, + "rewards/tag_count_reward/std": 0.2775629162788391, "step": 298 }, { @@ -8657,27 +8657,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.169921875, + "completions/clipped_ratio": 0.185546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 1149.375, - "completions/mean_terminated_length": 965.421142578125, - "completions/min_length": 237.0, - "completions/min_terminated_length": 237.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1092.74609375, + "completions/mean_terminated_length": 875.122314453125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, "epoch": 0.10207390970384911, - "grad_norm": 0.13443991541862488, - "kl": 0.03179931640625, - "learning_rate": 9.999885035791019e-07, - "loss": 0.1773, - "num_tokens": 203987681.0, - "reward": 1.0078125, - "reward_std": 0.3582117259502411, - "rewards/accuracy_reward/mean": 0.15234375, - "rewards/accuracy_reward/std": 0.35970520973205566, + "grad_norm": 0.13738124072551727, + "kl": 0.030242919921875, + "learning_rate": 9.999920103054712e-07, + "loss": 0.2349, + "num_tokens": 232552765.0, + "reward": 0.978515625, + "reward_std": 0.3726804256439209, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.85546875, - "rewards/tag_count_reward/std": 0.27064353227615356, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.2847379744052887, "step": 299 }, { @@ -8686,27 +8686,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.146484375, + "completions/clipped_ratio": 0.212890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 1174.439453125, - "completions/mean_terminated_length": 1024.5147705078125, - "completions/min_length": 209.0, - "completions/min_terminated_length": 209.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1169.267578125, + "completions/mean_terminated_length": 931.5955200195312, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, "epoch": 0.10241529401723991, - "grad_norm": 0.1388065218925476, - "kl": 0.0328369140625, - "learning_rate": 9.999843521178375e-07, - "loss": 0.1374, - "num_tokens": 204670050.0, - "reward": 0.9462890625, - "reward_std": 0.2924199104309082, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, + "grad_norm": 0.13634316623210907, + "kl": 0.029205322265625, + "learning_rate": 9.999884948548586e-07, + "loss": 0.1817, + "num_tokens": 233232486.0, + "reward": 0.927734375, + "reward_std": 0.3437793552875519, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8701171875, - "rewards/tag_count_reward/std": 0.2564731240272522, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.29551440477371216, "step": 300 }, { @@ -8715,27 +8715,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1171875, + "completions/clipped_ratio": 0.150390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1104.498046875, - "completions/mean_terminated_length": 979.25439453125, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1064.658203125, + "completions/mean_terminated_length": 890.5953979492188, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, "epoch": 0.10275667833063071, - "grad_norm": 0.12873072922229767, - "kl": 0.031036376953125, - "learning_rate": 9.99979561986089e-07, - "loss": 0.1286, - "num_tokens": 205310001.0, - "reward": 1.091796875, - "reward_std": 0.3492361605167389, - "rewards/accuracy_reward/mean": 0.185546875, - "rewards/accuracy_reward/std": 0.38912075757980347, + "grad_norm": 0.13200511038303375, + "kl": 0.028533935546875, + "learning_rate": 9.999843402432097e-07, + "loss": 0.2077, + "num_tokens": 233852039.0, + "reward": 1.017578125, + "reward_std": 0.362113893032074, + "rewards/accuracy_reward/mean": 0.166015625, + "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90625, - "rewards/tag_count_reward/std": 0.22502446174621582, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.2716865837574005, "step": 301 }, { @@ -8744,27 +8744,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.130859375, + "completions/clipped_ratio": 0.2109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1101.455078125, - "completions/mean_terminated_length": 958.9415893554688, - "completions/min_length": 2.0, - "completions/min_terminated_length": 2.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1142.224609375, + "completions/mean_terminated_length": 900.0866088867188, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, "epoch": 0.10309806264402151, - "grad_norm": 0.13499902188777924, - "kl": 0.03302001953125, - "learning_rate": 9.999741331906542e-07, - "loss": 0.1448, - "num_tokens": 205951834.0, - "reward": 1.00927734375, - "reward_std": 0.28933781385421753, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, + "grad_norm": 0.14424169063568115, + "kl": 0.033843994140625, + "learning_rate": 9.999795464764258e-07, + "loss": 0.1977, + "num_tokens": 234514746.0, + "reward": 0.91845703125, + "reward_std": 0.3266125023365021, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88623046875, - "rewards/tag_count_reward/std": 0.23949728906154633, + "rewards/tag_count_reward/mean": 0.81103515625, + "rewards/tag_count_reward/std": 0.3001309335231781, "step": 302 }, { @@ -8773,27 +8773,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1328125, + "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1156.25390625, - "completions/mean_terminated_length": 1019.68017578125, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1158.349609375, + "completions/mean_terminated_length": 931.5759887695312, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.1034394469574123, - "grad_norm": 0.1338455229997635, - "kl": 0.0301513671875, - "learning_rate": 9.99968065739239e-07, - "loss": 0.1571, - "num_tokens": 206628780.0, - "reward": 1.00341796875, - "reward_std": 0.2902722954750061, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, + "grad_norm": 0.14931531250476837, + "kl": 0.026947021484375, + "learning_rate": 9.99974113561316e-07, + "loss": 0.2459, + "num_tokens": 235192765.0, + "reward": 0.91748046875, + "reward_std": 0.322037935256958, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88623046875, - "rewards/tag_count_reward/std": 0.23485609889030457, + "rewards/tag_count_reward/mean": 0.81787109375, + "rewards/tag_count_reward/std": 0.29906564950942993, "step": 303 }, { @@ -8802,27 +8802,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.14453125, + "completions/clipped_ratio": 0.19921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1173.71875, - "completions/mean_terminated_length": 1026.009033203125, - "completions/min_length": 196.0, - "completions/min_terminated_length": 196.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1137.58984375, + "completions/mean_terminated_length": 911.0975341796875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.1037808312708031, - "grad_norm": 0.1281381994485855, - "kl": 0.029388427734375, - "learning_rate": 9.999613596404544e-07, - "loss": 0.1509, - "num_tokens": 207305836.0, - "reward": 1.00146484375, - "reward_std": 0.3014447093009949, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, + "grad_norm": 0.15357358753681183, + "kl": 0.02783203125, + "learning_rate": 9.999680415055969e-07, + "loss": 0.165, + "num_tokens": 235851323.0, + "reward": 0.94580078125, + "reward_std": 0.3047916889190674, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88818359375, - "rewards/tag_count_reward/std": 0.2399078905582428, + "rewards/tag_count_reward/mean": 0.83642578125, + "rewards/tag_count_reward/std": 0.280613511800766, "step": 304 }, { @@ -8831,27 +8831,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.154296875, + "completions/clipped_ratio": 0.24609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1184.798828125, - "completions/mean_terminated_length": 1027.3094482421875, - "completions/min_length": 220.0, - "completions/min_terminated_length": 220.0, + "completions/mean_length": 1263.966796875, + "completions/mean_terminated_length": 1008.038818359375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.1041222155841939, - "grad_norm": 0.10877677798271179, - "kl": 0.028076171875, - "learning_rate": 9.999540149038193e-07, - "loss": 0.0941, - "num_tokens": 207985909.0, - "reward": 1.03955078125, - "reward_std": 0.26682427525520325, - "rewards/accuracy_reward/mean": 0.150390625, - "rewards/accuracy_reward/std": 0.35780346393585205, + "grad_norm": 0.8344973921775818, + "kl": 0.035736083984375, + "learning_rate": 9.999613303178934e-07, + "loss": 0.1767, + "num_tokens": 236571930.0, + "reward": 0.9365234375, + "reward_std": 0.34821170568466187, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88916015625, - "rewards/tag_count_reward/std": 0.24339549243450165, + "rewards/tag_count_reward/mean": 0.7880859375, + "rewards/tag_count_reward/std": 0.30928972363471985, "step": 305 }, { @@ -8860,27 +8860,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.103515625, + "completions/clipped_ratio": 0.19921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1093.181640625, - "completions/mean_terminated_length": 982.9302368164062, - "completions/min_length": 195.0, - "completions/min_terminated_length": 195.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1114.48828125, + "completions/mean_terminated_length": 882.248779296875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.1044635998975847, - "grad_norm": 0.13633041083812714, - "kl": 0.03131103515625, - "learning_rate": 9.999460315397577e-07, - "loss": 0.1139, - "num_tokens": 208629042.0, - "reward": 1.0380859375, - "reward_std": 0.27025923132896423, - "rewards/accuracy_reward/mean": 0.134765625, - "rewards/accuracy_reward/std": 0.3418070077896118, + "grad_norm": 0.13022573292255402, + "kl": 0.029266357421875, + "learning_rate": 9.999539800077384e-07, + "loss": 0.1916, + "num_tokens": 237225972.0, + "reward": 0.97021484375, + "reward_std": 0.35741788148880005, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9033203125, - "rewards/tag_count_reward/std": 0.2226831316947937, + "rewards/tag_count_reward/mean": 0.82568359375, + "rewards/tag_count_reward/std": 0.28755927085876465, "step": 306 }, { @@ -8889,27 +8889,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.162109375, + "completions/clipped_ratio": 0.26171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 1133.87890625, - "completions/mean_terminated_length": 957.02099609375, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1174.61328125, + "completions/mean_terminated_length": 865.0, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, "epoch": 0.1048049842109755, - "grad_norm": 0.12762504816055298, - "kl": 0.03314208984375, - "learning_rate": 9.999374095596004e-07, - "loss": 0.1499, - "num_tokens": 209281860.0, - "reward": 0.970703125, - "reward_std": 0.27441883087158203, - "rewards/accuracy_reward/mean": 0.10000000149011612, - "rewards/accuracy_reward/std": 0.30031299591064453, + "grad_norm": 0.13066816329956055, + "kl": 0.028778076171875, + "learning_rate": 9.999459905855716e-07, + "loss": 0.1739, + "num_tokens": 237899646.0, + "reward": 0.86865234375, + "reward_std": 0.3209155797958374, + "rewards/accuracy_reward/mean": 0.0833333358168602, + "rewards/accuracy_reward/std": 0.2766737639904022, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.876953125, - "rewards/tag_count_reward/std": 0.25266891717910767, + "rewards/tag_count_reward/mean": 0.79052734375, + "rewards/tag_count_reward/std": 0.3111482262611389, "step": 307 }, { @@ -8918,27 +8918,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.130859375, + "completions/clipped_ratio": 0.20703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1178.130859375, - "completions/mean_terminated_length": 1047.161865234375, - "completions/min_length": 190.0, - "completions/min_terminated_length": 190.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1208.798828125, + "completions/mean_terminated_length": 989.697021484375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, "epoch": 0.1051463685243663, - "grad_norm": 0.12441974133253098, - "kl": 0.03045654296875, - "learning_rate": 9.99928148975585e-07, - "loss": 0.1385, - "num_tokens": 209965031.0, - "reward": 1.0419921875, - "reward_std": 0.27181071043014526, - "rewards/accuracy_reward/mean": 0.142578125, - "rewards/accuracy_reward/std": 0.3499840497970581, + "grad_norm": 0.18847060203552246, + "kl": 0.03399658203125, + "learning_rate": 9.999373620627412e-07, + "loss": 0.2203, + "num_tokens": 238598519.0, + "reward": 0.9638671875, + "reward_std": 0.3631572127342224, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8994140625, - "rewards/tag_count_reward/std": 0.22695045173168182, + "rewards/tag_count_reward/mean": 0.8154296875, + "rewards/tag_count_reward/std": 0.30103573203086853, "step": 308 }, { @@ -8947,27 +8947,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.08984375, + "completions/clipped_ratio": 0.177734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 1064.421875, - "completions/mean_terminated_length": 967.3304443359375, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1102.20703125, + "completions/mean_terminated_length": 897.7720336914062, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.1054877528377571, - "grad_norm": 0.1263512820005417, - "kl": 0.030517578125, - "learning_rate": 9.99918249800855e-07, - "loss": 0.1083, - "num_tokens": 210586767.0, - "reward": 1.08251953125, - "reward_std": 0.2911805808544159, - "rewards/accuracy_reward/mean": 0.166015625, - "rewards/accuracy_reward/std": 0.3724585771560669, + "grad_norm": 0.12446834146976471, + "kl": 0.025115966796875, + "learning_rate": 9.999280944515035e-07, + "loss": 0.1647, + "num_tokens": 239239601.0, + "reward": 0.98974609375, + "reward_std": 0.32373538613319397, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.20804768800735474, + "rewards/tag_count_reward/mean": 0.84130859375, + "rewards/tag_count_reward/std": 0.27905985713005066, "step": 309 }, { @@ -8976,27 +8976,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.078125, + "completions/clipped_ratio": 0.134765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 1017.6171875, - "completions/mean_terminated_length": 930.296630859375, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1039.314453125, + "completions/mean_terminated_length": 882.2054443359375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.1058291371511479, - "grad_norm": 0.1431308537721634, - "kl": 0.0330810546875, - "learning_rate": 9.999077120494608e-07, - "loss": 0.1317, - "num_tokens": 211180107.0, - "reward": 1.08154296875, - "reward_std": 0.2786792814731598, - "rewards/accuracy_reward/mean": 0.16015625, - "rewards/accuracy_reward/std": 0.3671095669269562, + "grad_norm": 0.137374609708786, + "kl": 0.028167724609375, + "learning_rate": 9.99918187765022e-07, + "loss": 0.1498, + "num_tokens": 239844050.0, + "reward": 1.01611328125, + "reward_std": 0.31929969787597656, + "rewards/accuracy_reward/mean": 0.162109375, + "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.2040361613035202, + "rewards/tag_count_reward/mean": 0.85400390625, + "rewards/tag_count_reward/std": 0.2608667314052582, "step": 310 }, { @@ -9005,27 +9005,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.115234375, + "completions/clipped_ratio": 0.15234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1982.0, - "completions/mean_length": 1058.0546875, - "completions/mean_terminated_length": 929.1213989257812, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1032.001953125, + "completions/mean_terminated_length": 849.4031982421875, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, "epoch": 0.1061705214645387, - "grad_norm": 0.14184510707855225, - "kl": 0.03277587890625, - "learning_rate": 9.998965357363583e-07, - "loss": 0.1255, - "num_tokens": 211800999.0, - "reward": 0.97998046875, - "reward_std": 0.2651514410972595, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, + "grad_norm": 0.14066371321678162, + "kl": 0.027435302734375, + "learning_rate": 9.99907642017368e-07, + "loss": 0.2044, + "num_tokens": 240451603.0, + "reward": 0.92724609375, + "reward_std": 0.28212085366249084, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90185546875, - "rewards/tag_count_reward/std": 0.22504940629005432, + "rewards/tag_count_reward/mean": 0.85107421875, + "rewards/tag_count_reward/std": 0.2666451334953308, "step": 311 }, { @@ -9034,27 +9034,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.185546875, + "completions/clipped_ratio": 0.23828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1193.66796875, - "completions/mean_terminated_length": 999.0360107421875, - "completions/min_length": 198.0, - "completions/min_terminated_length": 198.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1233.1640625, + "completions/mean_terminated_length": 978.2667236328125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, "epoch": 0.1065119057779295, - "grad_norm": 0.127924844622612, - "kl": 0.03619384765625, - "learning_rate": 9.998847208774107e-07, - "loss": 0.1751, - "num_tokens": 212486701.0, - "reward": 0.9443359375, - "reward_std": 0.3127998411655426, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, + "grad_norm": 0.12637880444526672, + "kl": 0.026824951171875, + "learning_rate": 9.998964572235205e-07, + "loss": 0.2305, + "num_tokens": 241157527.0, + "reward": 0.8857421875, + "reward_std": 0.34094083309173584, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8525390625, - "rewards/tag_count_reward/std": 0.2726678252220154, + "rewards/tag_count_reward/mean": 0.7763671875, + "rewards/tag_count_reward/std": 0.31208235025405884, "step": 312 }, { @@ -9063,27 +9063,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.146484375, + "completions/clipped_ratio": 0.1796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1163.001953125, - "completions/mean_terminated_length": 1011.1143798828125, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1156.83203125, + "completions/mean_terminated_length": 961.6238403320312, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, "epoch": 0.1068532900913203, - "grad_norm": 0.12907107174396515, - "kl": 0.031494140625, - "learning_rate": 9.998722674893869e-07, - "loss": 0.1676, - "num_tokens": 213154302.0, - "reward": 1.0478515625, - "reward_std": 0.3261502981185913, - "rewards/accuracy_reward/mean": 0.16015625, - "rewards/accuracy_reward/std": 0.3671095669269562, + "grad_norm": 0.2379995584487915, + "kl": 0.0279541015625, + "learning_rate": 9.998846333993667e-07, + "loss": 0.2032, + "num_tokens": 241821969.0, + "reward": 0.99072265625, + "reward_std": 0.3368230164051056, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8876953125, - "rewards/tag_count_reward/std": 0.24095164239406586, + "rewards/tag_count_reward/mean": 0.83447265625, + "rewards/tag_count_reward/std": 0.28164342045783997, "step": 313 }, { @@ -9092,27 +9092,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.13671875, + "completions/clipped_ratio": 0.23046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 1181.607421875, - "completions/mean_terminated_length": 1044.39599609375, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1211.44140625, + "completions/mean_terminated_length": 960.8984375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.1071946744047111, - "grad_norm": 0.12877912819385529, - "kl": 0.0335693359375, - "learning_rate": 9.99859175589962e-07, - "loss": 0.1381, - "num_tokens": 213838085.0, - "reward": 1.015625, - "reward_std": 0.317222535610199, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, + "grad_norm": 0.12838341295719147, + "kl": 0.024261474609375, + "learning_rate": 9.99872170561701e-07, + "loss": 0.2032, + "num_tokens": 242521027.0, + "reward": 0.9482421875, + "reward_std": 0.37479346990585327, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.888671875, - "rewards/tag_count_reward/std": 0.23885856568813324, + "rewards/tag_count_reward/mean": 0.8095703125, + "rewards/tag_count_reward/std": 0.29570674896240234, "step": 314 }, { @@ -9121,27 +9121,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.142578125, + "completions/clipped_ratio": 0.181640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1161.228515625, - "completions/mean_terminated_length": 1013.7699584960938, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1167.080078125, + "completions/mean_terminated_length": 971.5537109375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, "epoch": 0.1075360587181019, - "grad_norm": 0.2052370309829712, - "kl": 0.034515380859375, - "learning_rate": 9.998454451977178e-07, - "loss": 0.1532, - "num_tokens": 214508122.0, - "reward": 0.9892578125, - "reward_std": 0.29635995626449585, - "rewards/accuracy_reward/mean": 0.10080645233392715, - "rewards/accuracy_reward/std": 0.30137622356414795, + "grad_norm": 0.15141969919204712, + "kl": 0.02655029296875, + "learning_rate": 9.99859068728225e-07, + "loss": 0.1864, + "num_tokens": 243194060.0, + "reward": 0.96044921875, + "reward_std": 0.3245297372341156, + "rewards/accuracy_reward/mean": 0.12298387289047241, + "rewards/accuracy_reward/std": 0.32875028252601624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8916015625, - "rewards/tag_count_reward/std": 0.2345370054244995, + "rewards/tag_count_reward/mean": 0.84130859375, + "rewards/tag_count_reward/std": 0.27641761302948, "step": 315 }, { @@ -9150,27 +9150,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.14453125, + "completions/clipped_ratio": 0.16796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 1156.36328125, - "completions/mean_terminated_length": 1005.721435546875, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1121.447265625, + "completions/mean_terminated_length": 934.396728515625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.1078774430314927, - "grad_norm": 0.1357164978981018, - "kl": 0.03118896484375, - "learning_rate": 9.99831076332142e-07, - "loss": 0.2021, - "num_tokens": 215176228.0, - "reward": 0.96875, - "reward_std": 0.27862483263015747, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, + "grad_norm": 0.12909026443958282, + "kl": 0.0235595703125, + "learning_rate": 9.998453279175492e-07, + "loss": 0.2343, + "num_tokens": 243844289.0, + "reward": 0.93115234375, + "reward_std": 0.30712559819221497, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.888671875, - "rewards/tag_count_reward/std": 0.2419114112854004, + "rewards/tag_count_reward/mean": 0.83935546875, + "rewards/tag_count_reward/std": 0.268537700176239, "step": 316 }, { @@ -9179,27 +9179,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1328125, + "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 1110.935546875, - "completions/mean_terminated_length": 967.4212036132812, - "completions/min_length": 201.0, - "completions/min_terminated_length": 201.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1136.208984375, + "completions/mean_terminated_length": 925.7957153320312, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.1082188273448835, - "grad_norm": 0.7026339173316956, - "kl": 0.042694091796875, - "learning_rate": 9.998160690136289e-07, - "loss": 0.133, - "num_tokens": 215820387.0, - "reward": 1.03515625, - "reward_std": 0.31893688440322876, - "rewards/accuracy_reward/mean": 0.13671875, - "rewards/accuracy_reward/std": 0.3438861668109894, + "grad_norm": 0.12713587284088135, + "kl": 0.0272216796875, + "learning_rate": 9.998309481491906e-07, + "loss": 0.1886, + "num_tokens": 244501388.0, + "reward": 0.9765625, + "reward_std": 0.36434584856033325, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8984375, - "rewards/tag_count_reward/std": 0.2318510115146637, + "rewards/tag_count_reward/mean": 0.822265625, + "rewards/tag_count_reward/std": 0.2830888330936432, "step": 317 }, { @@ -9208,27 +9208,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.08984375, + "completions/clipped_ratio": 0.146484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1983.0, - "completions/mean_length": 938.87109375, - "completions/mean_terminated_length": 829.3862915039062, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 960.869140625, + "completions/mean_terminated_length": 774.2905883789062, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, "epoch": 0.1085602116582743, - "grad_norm": 5.583955764770508, - "kl": 0.05126953125, - "learning_rate": 9.998004232634777e-07, - "loss": 0.1829, - "num_tokens": 216372337.0, - "reward": 1.052734375, - "reward_std": 0.2629846930503845, - "rewards/accuracy_reward/mean": 0.13104838132858276, - "rewards/accuracy_reward/std": 0.3377939760684967, + "grad_norm": 0.16936686635017395, + "kl": 0.029144287109375, + "learning_rate": 9.998159294435742e-07, + "loss": 0.1699, + "num_tokens": 245064601.0, + "reward": 1.005859375, + "reward_std": 0.3112642168998718, + "rewards/accuracy_reward/mean": 0.15322580933570862, + "rewards/accuracy_reward/std": 0.36056873202323914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.20238155126571655, + "rewards/tag_count_reward/mean": 0.857421875, + "rewards/tag_count_reward/std": 0.26345139741897583, "step": 318 }, { @@ -9237,27 +9237,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.138671875, + "completions/clipped_ratio": 0.18359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1135.80078125, - "completions/mean_terminated_length": 988.9387817382812, - "completions/min_length": 210.0, - "completions/min_terminated_length": 210.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1139.046875, + "completions/mean_terminated_length": 934.64111328125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, "epoch": 0.1089015959716651, - "grad_norm": 0.16040125489234924, - "kl": 0.03277587890625, - "learning_rate": 9.997841391038957e-07, - "loss": 0.1377, - "num_tokens": 217024795.0, - "reward": 1.017578125, - "reward_std": 0.31150808930397034, - "rewards/accuracy_reward/mean": 0.13508065044879913, - "rewards/accuracy_reward/std": 0.3421548008918762, + "grad_norm": 0.14525948464870453, + "kl": 0.025726318359375, + "learning_rate": 9.998002718220323e-07, + "loss": 0.1549, + "num_tokens": 245718721.0, + "reward": 0.9501953125, + "reward_std": 0.33043381571769714, + "rewards/accuracy_reward/mean": 0.1391129046678543, + "rewards/accuracy_reward/std": 0.34641367197036743, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88671875, - "rewards/tag_count_reward/std": 0.2410012036561966, + "rewards/tag_count_reward/mean": 0.8154296875, + "rewards/tag_count_reward/std": 0.2834581732749939, "step": 319 }, { @@ -9266,27 +9266,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.078125, + "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1056.83984375, - "completions/mean_terminated_length": 972.8432006835938, - "completions/min_length": 176.0, - "completions/min_terminated_length": 176.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1048.458984375, + "completions/mean_terminated_length": 884.897705078125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, "epoch": 0.1092429802850559, - "grad_norm": 0.13190127909183502, - "kl": 0.03271484375, - "learning_rate": 9.997672165579948e-07, - "loss": 0.1314, - "num_tokens": 217648313.0, - "reward": 0.99755859375, - "reward_std": 0.2191423624753952, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, + "grad_norm": 0.14780759811401367, + "kl": 0.025604248046875, + "learning_rate": 9.997839753068054e-07, + "loss": 0.2192, + "num_tokens": 246337948.0, + "reward": 0.947265625, + "reward_std": 0.2669388949871063, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.20040719211101532, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.25815349817276, "step": 320 }, { @@ -9295,27 +9295,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.134765625, + "completions/clipped_ratio": 0.2265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1155.67578125, - "completions/mean_terminated_length": 1016.6907958984375, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1203.94921875, + "completions/mean_terminated_length": 956.7020263671875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.1095843645984467, - "grad_norm": 0.12830433249473572, - "kl": 0.03466796875, - "learning_rate": 9.997496556497934e-07, - "loss": 0.1481, - "num_tokens": 218315059.0, - "reward": 1.048828125, - "reward_std": 0.3038339912891388, - "rewards/accuracy_reward/mean": 0.150390625, - "rewards/accuracy_reward/std": 0.35780346393585205, + "grad_norm": 0.12179320305585861, + "kl": 0.026092529296875, + "learning_rate": 9.997670399210405e-07, + "loss": 0.1962, + "num_tokens": 247029410.0, + "reward": 0.955078125, + "reward_std": 0.3568507432937622, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8984375, - "rewards/tag_count_reward/std": 0.2344738394021988, + "rewards/tag_count_reward/mean": 0.798828125, + "rewards/tag_count_reward/std": 0.2968500554561615, "step": 321 }, { @@ -9324,27 +9324,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.087890625, + "completions/clipped_ratio": 0.11328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 1029.49609375, - "completions/mean_terminated_length": 931.3533325195312, - "completions/min_length": 267.0, - "completions/min_terminated_length": 267.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1039.693359375, + "completions/mean_terminated_length": 910.8788452148438, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, "epoch": 0.1099257489118375, - "grad_norm": 0.24386551976203918, - "kl": 0.03375244140625, - "learning_rate": 9.997314564042165e-07, - "loss": 0.1044, - "num_tokens": 218913713.0, - "reward": 1.0634765625, - "reward_std": 0.2716171443462372, - "rewards/accuracy_reward/mean": 0.14453125, - "rewards/accuracy_reward/std": 0.35197147727012634, + "grad_norm": 0.1347188800573349, + "kl": 0.02587890625, + "learning_rate": 9.997494656887927e-07, + "loss": 0.1647, + "num_tokens": 247633285.0, + "reward": 1.01416015625, + "reward_std": 0.31157585978507996, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.2015654444694519, + "rewards/tag_count_reward/mean": 0.88330078125, + "rewards/tag_count_reward/std": 0.23601685464382172, "step": 322 }, { @@ -9353,27 +9353,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.12890625, + "completions/clipped_ratio": 0.162109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1173.41796875, - "completions/mean_terminated_length": 1043.99560546875, - "completions/min_length": 218.0, - "completions/min_terminated_length": 218.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1161.80859375, + "completions/mean_terminated_length": 990.3543090820312, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, "epoch": 0.1102671332252283, - "grad_norm": 0.13542306423187256, - "kl": 0.03369140625, - "learning_rate": 9.997126188470941e-07, - "loss": 0.1083, - "num_tokens": 219593111.0, - "reward": 1.0234375, - "reward_std": 0.28221234679222107, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, + "grad_norm": 0.12787990272045135, + "kl": 0.024627685546875, + "learning_rate": 9.997312526350242e-07, + "loss": 0.1688, + "num_tokens": 248306739.0, + "reward": 0.974609375, + "reward_std": 0.33962225914001465, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.900390625, - "rewards/tag_count_reward/std": 0.22952312231063843, + "rewards/tag_count_reward/mean": 0.853515625, + "rewards/tag_count_reward/std": 0.25941628217697144, "step": 323 }, { @@ -9382,27 +9382,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.138671875, + "completions/clipped_ratio": 0.251953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1177.892578125, - "completions/mean_terminated_length": 1037.8072509765625, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1231.490234375, + "completions/mean_terminated_length": 956.4778442382812, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, "epoch": 0.1106085175386191, - "grad_norm": 0.20564241707324982, - "kl": 0.035614013671875, - "learning_rate": 9.996931430051626e-07, - "loss": 0.1422, - "num_tokens": 220286640.0, - "reward": 0.95263671875, - "reward_std": 0.2707866132259369, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, + "grad_norm": 0.11818953603506088, + "kl": 0.0235595703125, + "learning_rate": 9.997124007856049e-07, + "loss": 0.1793, + "num_tokens": 249027710.0, + "reward": 0.84423828125, + "reward_std": 0.3030323386192322, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89013671875, - "rewards/tag_count_reward/std": 0.2497853934764862, + "rewards/tag_count_reward/mean": 0.78369140625, + "rewards/tag_count_reward/std": 0.3146965205669403, "step": 324 }, { @@ -9411,27 +9411,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.07421875, + "completions/clipped_ratio": 0.1328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 1053.9921875, - "completions/mean_terminated_length": 974.3037719726562, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1041.869140625, + "completions/mean_terminated_length": 887.7770385742188, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.1109499018520099, - "grad_norm": 0.15150968730449677, - "kl": 0.036376953125, - "learning_rate": 9.99673028906065e-07, - "loss": 0.1305, - "num_tokens": 220895260.0, - "reward": 1.10986328125, - "reward_std": 0.28572919964790344, - "rewards/accuracy_reward/mean": 0.177734375, - "rewards/accuracy_reward/std": 0.3826628625392914, + "grad_norm": 0.15102706849575043, + "kl": 0.02838134765625, + "learning_rate": 9.996929101673117e-07, + "loss": 0.1679, + "num_tokens": 249630123.0, + "reward": 1.0576171875, + "reward_std": 0.33777889609336853, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39069411158561707, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.18809467554092407, + "rewards/tag_count_reward/mean": 0.8701171875, + "rewards/tag_count_reward/std": 0.24625489115715027, "step": 325 }, { @@ -9440,27 +9440,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.123046875, + "completions/clipped_ratio": 0.24609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 1158.326171875, - "completions/mean_terminated_length": 1033.4945068359375, - "completions/min_length": 183.0, - "completions/min_terminated_length": 183.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1215.65625, + "completions/mean_terminated_length": 943.95849609375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, "epoch": 0.1112912861654007, - "grad_norm": 0.14217494428157806, - "kl": 0.032989501953125, - "learning_rate": 9.996522765783488e-07, - "loss": 0.1176, - "num_tokens": 221562723.0, - "reward": 1.03955078125, - "reward_std": 0.2918170392513275, - "rewards/accuracy_reward/mean": 0.13671875, - "rewards/accuracy_reward/std": 0.3438861668109894, + "grad_norm": 0.3314480781555176, + "kl": 0.030853271484375, + "learning_rate": 9.996727808078292e-07, + "loss": 0.1608, + "num_tokens": 250326939.0, + "reward": 0.96728515625, + "reward_std": 0.32705235481262207, + "rewards/accuracy_reward/mean": 0.162109375, + "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90283203125, - "rewards/tag_count_reward/std": 0.22219502925872803, + "rewards/tag_count_reward/mean": 0.80517578125, + "rewards/tag_count_reward/std": 0.29922857880592346, "step": 326 }, { @@ -9469,27 +9469,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0859375, + "completions/clipped_ratio": 0.14453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1088.98046875, - "completions/mean_terminated_length": 998.8162841796875, - "completions/min_length": 236.0, - "completions/min_terminated_length": 236.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1098.333984375, + "completions/mean_terminated_length": 937.8880615234375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, "epoch": 0.1116326704787915, - "grad_norm": 0.28022122383117676, - "kl": 0.033935546875, - "learning_rate": 9.996308860514686e-07, - "loss": 0.0746, - "num_tokens": 222194681.0, - "reward": 1.142578125, - "reward_std": 0.33608290553092957, - "rewards/accuracy_reward/mean": 0.21484375, - "rewards/accuracy_reward/std": 0.4111155867576599, + "grad_norm": 0.12496457993984222, + "kl": 0.02447509765625, + "learning_rate": 9.996520127357488e-07, + "loss": 0.1215, + "num_tokens": 250963686.0, + "reward": 1.1142578125, + "reward_std": 0.38647955656051636, + "rewards/accuracy_reward/mean": 0.248046875, + "rewards/accuracy_reward/std": 0.4323015511035919, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.1880800724029541, + "rewards/tag_count_reward/mean": 0.8662109375, + "rewards/tag_count_reward/std": 0.2456488162279129, "step": 327 }, { @@ -9498,27 +9498,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.111328125, + "completions/clipped_ratio": 0.166015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 1109.802734375, - "completions/mean_terminated_length": 992.2703857421875, - "completions/min_length": 216.0, - "completions/min_terminated_length": 216.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1126.291015625, + "completions/mean_terminated_length": 942.8126220703125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, "epoch": 0.1119740547921823, - "grad_norm": 0.18140637874603271, - "kl": 0.03533935546875, - "learning_rate": 9.996088573557843e-07, - "loss": 0.1316, - "num_tokens": 222838068.0, - "reward": 0.9814453125, - "reward_std": 0.2408694475889206, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, + "grad_norm": 0.13243867456912994, + "kl": 0.026123046875, + "learning_rate": 9.996306059805693e-07, + "loss": 0.1773, + "num_tokens": 251615515.0, + "reward": 0.9365234375, + "reward_std": 0.3187788426876068, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9072265625, - "rewards/tag_count_reward/std": 0.22542962431907654, + "rewards/tag_count_reward/mean": 0.8388671875, + "rewards/tag_count_reward/std": 0.27299004793167114, "step": 328 }, { @@ -9527,27 +9527,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.099609375, + "completions/clipped_ratio": 0.15234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1082.96484375, - "completions/mean_terminated_length": 976.2039184570312, - "completions/min_length": 168.0, - "completions/min_terminated_length": 168.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1105.86328125, + "completions/mean_terminated_length": 936.5391845703125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, "epoch": 0.1123154391055731, - "grad_norm": 0.13812144100666046, - "kl": 0.03448486328125, - "learning_rate": 9.995861905225617e-07, - "loss": 0.1212, - "num_tokens": 223464770.0, - "reward": 1.1123046875, - "reward_std": 0.27613845467567444, - "rewards/accuracy_reward/mean": 0.19140625, - "rewards/accuracy_reward/std": 0.3937928080558777, + "grad_norm": 0.13291388750076294, + "kl": 0.024688720703125, + "learning_rate": 9.99608560572697e-07, + "loss": 0.1792, + "num_tokens": 252253941.0, + "reward": 1.037109375, + "reward_std": 0.34071582555770874, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39069411158561707, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.20474500954151154, + "rewards/tag_count_reward/mean": 0.849609375, + "rewards/tag_count_reward/std": 0.2618798613548279, "step": 329 }, { @@ -9556,27 +9556,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.14453125, + "completions/clipped_ratio": 0.228515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 1178.630859375, - "completions/mean_terminated_length": 1031.7510986328125, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1203.212890625, + "completions/mean_terminated_length": 952.9848022460938, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.1126568234189639, - "grad_norm": 0.3180902898311615, - "kl": 0.03668212890625, - "learning_rate": 9.995628855839721e-07, - "loss": 0.1542, - "num_tokens": 224145125.0, - "reward": 0.95361328125, - "reward_std": 0.2629111409187317, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, + "grad_norm": 0.12625975906848907, + "kl": 0.0262451171875, + "learning_rate": 9.995858765434448e-07, + "loss": 0.2066, + "num_tokens": 252946882.0, + "reward": 0.8818359375, + "reward_std": 0.3105853796005249, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88134765625, - "rewards/tag_count_reward/std": 0.23865395784378052, + "rewards/tag_count_reward/mean": 0.8037109375, + "rewards/tag_count_reward/std": 0.2996996343135834, "step": 330 }, { @@ -9585,27 +9585,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.130859375, + "completions/clipped_ratio": 0.19921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 1097.04296875, - "completions/mean_terminated_length": 953.8651733398438, - "completions/min_length": 234.0, - "completions/min_terminated_length": 234.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1113.40234375, + "completions/mean_terminated_length": 880.8927001953125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, "epoch": 0.1129982077323547, - "grad_norm": 0.41690030694007874, - "kl": 0.04473876953125, - "learning_rate": 9.995389425730923e-07, - "loss": 0.1594, - "num_tokens": 224789451.0, - "reward": 0.98193359375, - "reward_std": 0.287788987159729, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, + "grad_norm": 0.28922170400619507, + "kl": 0.033233642578125, + "learning_rate": 9.995625539250332e-07, + "loss": 0.2163, + "num_tokens": 253599584.0, + "reward": 0.93017578125, + "reward_std": 0.34205788373947144, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89404296875, - "rewards/tag_count_reward/std": 0.2343510389328003, + "rewards/tag_count_reward/mean": 0.82861328125, + "rewards/tag_count_reward/std": 0.28591638803482056, "step": 331 }, { @@ -9614,27 +9614,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.171875, + "completions/clipped_ratio": 0.21484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1192.52734375, - "completions/mean_terminated_length": 1014.9764404296875, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1161.751953125, + "completions/mean_terminated_length": 919.2462768554688, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, "epoch": 0.1133395920457455, - "grad_norm": 59.31746292114258, - "kl": 0.782470703125, - "learning_rate": 9.995143615239056e-07, - "loss": 0.2046, - "num_tokens": 225479369.0, - "reward": 0.97314453125, - "reward_std": 0.2881585955619812, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, + "grad_norm": 0.17107222974300385, + "kl": 0.029327392578125, + "learning_rate": 9.995385927505893e-07, + "loss": 0.2005, + "num_tokens": 254273745.0, + "reward": 0.91259765625, + "reward_std": 0.3472982347011566, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.86962890625, - "rewards/tag_count_reward/std": 0.2583639323711395, + "rewards/tag_count_reward/mean": 0.80517578125, + "rewards/tag_count_reward/std": 0.3000449538230896, "step": 332 }, { @@ -9643,27 +9643,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.095703125, + "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1999.0, - "completions/mean_length": 1073.884765625, - "completions/mean_terminated_length": 970.7926025390625, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1103.44921875, + "completions/mean_terminated_length": 907.410400390625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.1136809763591363, - "grad_norm": 0.3465381860733032, - "kl": 0.046630859375, - "learning_rate": 9.994891424712998e-07, - "loss": 0.1354, - "num_tokens": 226107262.0, - "reward": 1.03369140625, - "reward_std": 0.2943110466003418, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, + "grad_norm": 0.13893826305866241, + "kl": 0.027740478515625, + "learning_rate": 9.995139930541476e-07, + "loss": 0.1546, + "num_tokens": 254916775.0, + "reward": 0.9716796875, + "reward_std": 0.30238959193229675, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.21345669031143188, + "rewards/tag_count_reward/mean": 0.8388671875, + "rewards/tag_count_reward/std": 0.27962982654571533, "step": 333 }, { @@ -9672,27 +9672,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.095703125, + "completions/clipped_ratio": 0.12890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 1059.50390625, - "completions/mean_terminated_length": 954.8898315429688, - "completions/min_length": 209.0, - "completions/min_terminated_length": 209.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1041.51953125, + "completions/mean_terminated_length": 892.5784912109375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, "epoch": 0.11402236067252709, - "grad_norm": 0.44690296053886414, - "kl": 0.0445556640625, - "learning_rate": 9.99463285451069e-07, - "loss": 0.1222, - "num_tokens": 226726960.0, - "reward": 1.07568359375, - "reward_std": 0.2955414354801178, - "rewards/accuracy_reward/mean": 0.146484375, - "rewards/accuracy_reward/std": 0.35393697023391724, + "grad_norm": 0.15418261289596558, + "kl": 0.028228759765625, + "learning_rate": 9.994887548706493e-07, + "loss": 0.1576, + "num_tokens": 255527265.0, + "reward": 1.03564453125, + "reward_std": 0.36193418502807617, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38430243730545044, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.19280590116977692, + "rewards/tag_count_reward/mean": 0.85595703125, + "rewards/tag_count_reward/std": 0.26195237040519714, "step": 334 }, { @@ -9701,27 +9701,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.087890625, + "completions/clipped_ratio": 0.130859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1074.240234375, - "completions/mean_terminated_length": 980.4089965820312, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 1945.0, + "completions/mean_length": 987.32421875, + "completions/mean_terminated_length": 827.626953125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, "epoch": 0.11436374498591789, - "grad_norm": 0.4894280433654785, - "kl": 0.044921875, - "learning_rate": 9.994367904999127e-07, - "loss": 0.1044, - "num_tokens": 227353979.0, - "reward": 1.08251953125, - "reward_std": 0.3042677938938141, - "rewards/accuracy_reward/mean": 0.16015625, - "rewards/accuracy_reward/std": 0.3671095669269562, + "grad_norm": 0.16910158097743988, + "kl": 0.03070068359375, + "learning_rate": 9.994628782359422e-07, + "loss": 0.1612, + "num_tokens": 256109783.0, + "reward": 1.083984375, + "reward_std": 0.34096887707710266, + "rewards/accuracy_reward/mean": 0.20703125, + "rewards/accuracy_reward/std": 0.40557438135147095, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.2020028978586197, + "rewards/tag_count_reward/mean": 0.876953125, + "rewards/tag_count_reward/std": 0.23975300788879395, "step": 335 }, { @@ -9730,27 +9730,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1328125, + "completions/clipped_ratio": 0.16015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1173.904296875, - "completions/mean_terminated_length": 1040.0338134765625, - "completions/min_length": 255.0, - "completions/min_terminated_length": 255.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1143.517578125, + "completions/mean_terminated_length": 971.0348510742188, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, "epoch": 0.11470512929930869, - "grad_norm": 15565.7607421875, - "kl": 145.09112548828125, - "learning_rate": 9.994096576554353e-07, - "loss": 5.8975, - "num_tokens": 228028906.0, - "reward": 1.11474609375, - "reward_std": 0.34920042753219604, - "rewards/accuracy_reward/mean": 0.216796875, - "rewards/accuracy_reward/std": 0.4124660789966583, + "grad_norm": 0.1310206651687622, + "kl": 0.02618408203125, + "learning_rate": 9.99436363186782e-07, + "loss": 0.1408, + "num_tokens": 256769152.0, + "reward": 1.09814453125, + "reward_std": 0.3855630159378052, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.43343618512153625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89794921875, - "rewards/tag_count_reward/std": 0.24171333014965057, + "rewards/tag_count_reward/mean": 0.84814453125, + "rewards/tag_count_reward/std": 0.26266634464263916, "step": 336 }, { @@ -9759,27 +9759,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.142578125, + "completions/clipped_ratio": 0.185546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1165.974609375, - "completions/mean_terminated_length": 1019.3052978515625, - "completions/min_length": 236.0, - "completions/min_terminated_length": 236.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1126.822265625, + "completions/mean_terminated_length": 916.961669921875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, "epoch": 0.11504651361269949, - "grad_norm": 3.400146245956421, - "kl": 0.08160400390625, - "learning_rate": 9.993818869561467e-07, - "loss": 0.1191, - "num_tokens": 228702605.0, - "reward": 1.0546875, - "reward_std": 0.30855315923690796, - "rewards/accuracy_reward/mean": 0.17540322244167328, - "rewards/accuracy_reward/std": 0.3806955814361572, + "grad_norm": 0.13938570022583008, + "kl": 0.02838134765625, + "learning_rate": 9.994092097608302e-07, + "loss": 0.1767, + "num_tokens": 257422805.0, + "reward": 0.97998046875, + "reward_std": 0.3367058038711548, + "rewards/accuracy_reward/mean": 0.1572580635547638, + "rewards/accuracy_reward/std": 0.36441144347190857, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.884765625, - "rewards/tag_count_reward/std": 0.2380250245332718, + "rewards/tag_count_reward/mean": 0.82763671875, + "rewards/tag_count_reward/std": 0.2857559025287628, "step": 337 }, { @@ -9788,27 +9788,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.140625, + "completions/clipped_ratio": 0.11328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 1144.80859375, - "completions/mean_terminated_length": 997.0136108398438, - "completions/min_length": 173.0, - "completions/min_terminated_length": 173.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1021.044921875, + "completions/mean_terminated_length": 889.8479614257812, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.11538789792609029, - "grad_norm": 0.46814846992492676, - "kl": 0.04425048828125, - "learning_rate": 9.99353478441463e-07, - "loss": 0.1017, - "num_tokens": 229372635.0, - "reward": 1.00732421875, - "reward_std": 0.26241379976272583, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, + "grad_norm": 0.3287694752216339, + "kl": 0.029327392578125, + "learning_rate": 9.993814179966551e-07, + "loss": 0.1733, + "num_tokens": 258029468.0, + "reward": 0.99365234375, + "reward_std": 0.277240127325058, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89208984375, - "rewards/tag_count_reward/std": 0.23914992809295654, + "rewards/tag_count_reward/mean": 0.88818359375, + "rewards/tag_count_reward/std": 0.23475435376167297, "step": 338 }, { @@ -9817,27 +9817,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.126953125, + "completions/clipped_ratio": 0.18359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1107.2265625, - "completions/mean_terminated_length": 970.425048828125, - "completions/min_length": 296.0, - "completions/min_terminated_length": 296.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1084.23046875, + "completions/mean_terminated_length": 867.49755859375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, "epoch": 0.11572928223948109, - "grad_norm": 1.4173657894134521, - "kl": 0.06671142578125, - "learning_rate": 9.993244321517045e-07, - "loss": 0.1712, - "num_tokens": 230021071.0, - "reward": 1.05712890625, - "reward_std": 0.35515138506889343, - "rewards/accuracy_reward/mean": 0.154296875, - "rewards/accuracy_reward/std": 0.36158639192581177, + "grad_norm": 0.13999946415424347, + "kl": 0.02886962890625, + "learning_rate": 9.993529879337324e-07, + "loss": 0.22, + "num_tokens": 258666130.0, + "reward": 1.03125, + "reward_std": 0.397691547870636, + "rewards/accuracy_reward/mean": 0.18359375, + "rewards/accuracy_reward/std": 0.3875311613082886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90283203125, - "rewards/tag_count_reward/std": 0.23503504693508148, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.27843984961509705, "step": 339 }, { @@ -9846,27 +9846,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.115234375, + "completions/clipped_ratio": 0.138671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 1103.939453125, - "completions/mean_terminated_length": 980.9823608398438, - "completions/min_length": 242.0, - "completions/min_terminated_length": 242.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1019.7421875, + "completions/mean_terminated_length": 854.1950073242188, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, "epoch": 0.11607066655287189, - "grad_norm": 0.6045756936073303, - "kl": 0.04412841796875, - "learning_rate": 9.99294748128097e-07, - "loss": 0.1306, - "num_tokens": 230658224.0, - "reward": 1.015625, - "reward_std": 0.25914958119392395, - "rewards/accuracy_reward/mean": 0.11491935700178146, - "rewards/accuracy_reward/std": 0.3192465901374817, + "grad_norm": 0.15282343327999115, + "kl": 0.02935791015625, + "learning_rate": 9.993239196124437e-07, + "loss": 0.1908, + "num_tokens": 259260174.0, + "reward": 1.0, + "reward_std": 0.3294396996498108, + "rewards/accuracy_reward/mean": 0.1411290317773819, + "rewards/accuracy_reward/std": 0.3485061228275299, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.904296875, - "rewards/tag_count_reward/std": 0.22310540080070496, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.2567282021045685, "step": 340 }, { @@ -9875,27 +9875,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1875, + "completions/clipped_ratio": 0.14453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 1199.4140625, - "completions/mean_terminated_length": 1003.5865478515625, - "completions/min_length": 217.0, - "completions/min_terminated_length": 217.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1050.4765625, + "completions/mean_terminated_length": 881.9451904296875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, "epoch": 0.11641205086626269, - "grad_norm": 0.34343641996383667, - "kl": 0.0516357421875, - "learning_rate": 9.992644264127717e-07, - "loss": 0.1072, - "num_tokens": 231350276.0, - "reward": 1.06982421875, - "reward_std": 0.3374538719654083, - "rewards/accuracy_reward/mean": 0.21484375, - "rewards/accuracy_reward/std": 0.4111155867576599, + "grad_norm": 0.13595366477966309, + "kl": 0.027130126953125, + "learning_rate": 9.992942130740775e-07, + "loss": 0.1692, + "num_tokens": 259875970.0, + "reward": 1.09326171875, + "reward_std": 0.3595791459083557, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42402184009552, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.85498046875, - "rewards/tag_count_reward/std": 0.2759782373905182, + "rewards/tag_count_reward/mean": 0.85888671875, + "rewards/tag_count_reward/std": 0.2635452449321747, "step": 341 }, { @@ -9904,27 +9904,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.12890625, + "completions/clipped_ratio": 0.154296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1096.859375, - "completions/mean_terminated_length": 956.107666015625, - "completions/min_length": 228.0, - "completions/min_terminated_length": 228.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1004.53515625, + "completions/mean_terminated_length": 814.1570434570312, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, "epoch": 0.11675343517965349, - "grad_norm": 0.5945049524307251, - "kl": 0.03814697265625, - "learning_rate": 9.992334670487646e-07, - "loss": 0.1329, - "num_tokens": 231985548.0, - "reward": 1.0166015625, - "reward_std": 0.2711396813392639, - "rewards/accuracy_reward/mean": 0.11290322244167328, - "rewards/accuracy_reward/std": 0.3167939782142639, + "grad_norm": 0.15039843320846558, + "kl": 0.028045654296875, + "learning_rate": 9.99263868360829e-07, + "loss": 0.1846, + "num_tokens": 260463972.0, + "reward": 0.9951171875, + "reward_std": 0.3225647807121277, + "rewards/accuracy_reward/mean": 0.1411290317773819, + "rewards/accuracy_reward/std": 0.3485061228275299, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9072265625, - "rewards/tag_count_reward/std": 0.21993711590766907, + "rewards/tag_count_reward/mean": 0.8583984375, + "rewards/tag_count_reward/std": 0.25598084926605225, "step": 342 }, { @@ -9933,27 +9933,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.109375, + "completions/clipped_ratio": 0.115234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1054.84375, - "completions/mean_terminated_length": 932.877197265625, - "completions/min_length": 226.0, - "completions/min_terminated_length": 226.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 990.525390625, + "completions/mean_terminated_length": 852.7969360351562, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.11709481949304429, - "grad_norm": 2.2203285694122314, - "kl": 0.07745361328125, - "learning_rate": 9.99201870080017e-07, - "loss": 0.1794, - "num_tokens": 232594364.0, - "reward": 0.99560546875, - "reward_std": 0.24448131024837494, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, + "grad_norm": 0.16409173607826233, + "kl": 0.031890869140625, + "learning_rate": 9.992328855157995e-07, + "loss": 0.2332, + "num_tokens": 261039857.0, + "reward": 0.98486328125, + "reward_std": 0.3110677897930145, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91162109375, - "rewards/tag_count_reward/std": 0.21587374806404114, + "rewards/tag_count_reward/mean": 0.87548828125, + "rewards/tag_count_reward/std": 0.23976047337055206, "step": 343 }, { @@ -9962,27 +9962,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.146484375, + "completions/clipped_ratio": 0.189453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 1219.873046875, - "completions/mean_terminated_length": 1077.7459716796875, - "completions/min_length": 204.0, - "completions/min_terminated_length": 204.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1143.869140625, + "completions/mean_terminated_length": 932.5421752929688, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, "epoch": 0.11743620380643509, - "grad_norm": 0.4197451174259186, - "kl": 0.04046630859375, - "learning_rate": 9.99169635551375e-07, - "loss": 0.1429, - "num_tokens": 233295003.0, - "reward": 0.9736328125, - "reward_std": 0.26731252670288086, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, + "grad_norm": 0.13601191341876984, + "kl": 0.028350830078125, + "learning_rate": 9.992012645829967e-07, + "loss": 0.1945, + "num_tokens": 261701582.0, + "reward": 0.92431640625, + "reward_std": 0.3164195418357849, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8876953125, - "rewards/tag_count_reward/std": 0.2394239604473114, + "rewards/tag_count_reward/mean": 0.82275390625, + "rewards/tag_count_reward/std": 0.2827472686767578, "step": 344 }, { @@ -9991,27 +9991,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.08984375, + "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1111.126953125, - "completions/mean_terminated_length": 1018.6459350585938, - "completions/min_length": 184.0, - "completions/min_terminated_length": 184.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1006.763671875, + "completions/mean_terminated_length": 881.4508056640625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, "epoch": 0.11777758811982589, - "grad_norm": 0.35124969482421875, - "kl": 0.044921875, - "learning_rate": 9.991367635085897e-07, - "loss": 0.1177, - "num_tokens": 233943052.0, - "reward": 1.0830078125, - "reward_std": 0.27600592374801636, - "rewards/accuracy_reward/mean": 0.150390625, - "rewards/accuracy_reward/std": 0.35780346393585205, + "grad_norm": 0.1759040355682373, + "kl": 0.031585693359375, + "learning_rate": 9.991690056073353e-07, + "loss": 0.1321, + "num_tokens": 262296197.0, + "reward": 1.0283203125, + "reward_std": 0.33437466621398926, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.1898876577615738, + "rewards/tag_count_reward/mean": 0.8759765625, + "rewards/tag_count_reward/std": 0.23822365701198578, "step": 345 }, { @@ -10020,27 +10020,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.142578125, + "completions/clipped_ratio": 0.1953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 1194.251953125, - "completions/mean_terminated_length": 1052.2847900390625, - "completions/min_length": 222.0, - "completions/min_terminated_length": 222.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1131.197265625, + "completions/mean_terminated_length": 908.67236328125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.11811897243321669, - "grad_norm": 0.41605186462402344, - "kl": 0.055908203125, - "learning_rate": 9.991032539983166e-07, - "loss": 0.1502, - "num_tokens": 234637725.0, - "reward": 1.0078125, - "reward_std": 0.2899217903614044, - "rewards/accuracy_reward/mean": 0.119140625, - "rewards/accuracy_reward/std": 0.32427072525024414, + "grad_norm": 0.14551134407520294, + "kl": 0.025604248046875, + "learning_rate": 9.991361086346352e-07, + "loss": 0.1946, + "num_tokens": 262958586.0, + "reward": 0.9130859375, + "reward_std": 0.2960287630558014, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.888671875, - "rewards/tag_count_reward/std": 0.2419114112854004, + "rewards/tag_count_reward/mean": 0.8193359375, + "rewards/tag_count_reward/std": 0.2859686017036438, "step": 346 }, { @@ -10049,27 +10049,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.115234375, + "completions/clipped_ratio": 0.13671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1016.5390625, - "completions/mean_terminated_length": 882.1986694335938, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 978.11328125, + "completions/mean_terminated_length": 808.6742553710938, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, "epoch": 0.11846035674660749, - "grad_norm": 0.5375961065292358, - "kl": 0.05950927734375, - "learning_rate": 9.990691070681169e-07, - "loss": 0.1481, - "num_tokens": 235232241.0, - "reward": 1.08837890625, - "reward_std": 0.27179259061813354, - "rewards/accuracy_reward/mean": 0.173828125, - "rewards/accuracy_reward/std": 0.3793322443962097, + "grad_norm": 0.16117340326309204, + "kl": 0.030059814453125, + "learning_rate": 9.991025737116235e-07, + "loss": 0.1929, + "num_tokens": 263533428.0, + "reward": 1.044921875, + "reward_std": 0.3310525119304657, + "rewards/accuracy_reward/mean": 0.177734375, + "rewards/accuracy_reward/std": 0.3826628625392914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.2170523852109909, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.25544461607933044, "step": 347 }, { @@ -10078,27 +10078,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.14453125, + "completions/clipped_ratio": 0.16796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1119.318359375, - "completions/mean_terminated_length": 962.4177856445312, - "completions/min_length": 221.0, - "completions/min_terminated_length": 221.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1041.669921875, + "completions/mean_terminated_length": 838.5140991210938, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, "epoch": 0.11880174105999829, - "grad_norm": 0.6744995713233948, - "kl": 0.070068359375, - "learning_rate": 9.990343227664552e-07, - "loss": 0.1869, - "num_tokens": 235882772.0, - "reward": 1.04736328125, - "reward_std": 0.35462021827697754, - "rewards/accuracy_reward/mean": 0.1640625, - "rewards/accuracy_reward/std": 0.37069445848464966, + "grad_norm": 0.22051376104354858, + "kl": 0.0433349609375, + "learning_rate": 9.990684008859325e-07, + "loss": 0.2308, + "num_tokens": 264144203.0, + "reward": 1.07177734375, + "reward_std": 0.37083977460861206, + "rewards/accuracy_reward/mean": 0.212890625, + "rewards/accuracy_reward/std": 0.409751296043396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88330078125, - "rewards/tag_count_reward/std": 0.25253984332084656, + "rewards/tag_count_reward/mean": 0.85888671875, + "rewards/tag_count_reward/std": 0.26214927434921265, "step": 348 }, { @@ -10107,27 +10107,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.109375, + "completions/clipped_ratio": 0.158203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 1082.482421875, - "completions/mean_terminated_length": 963.9100952148438, - "completions/min_length": 231.0, - "completions/min_terminated_length": 231.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1047.736328125, + "completions/mean_terminated_length": 859.751708984375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, "epoch": 0.11914312537338909, - "grad_norm": 0.274726927280426, - "kl": 0.0433349609375, - "learning_rate": 9.98998901142702e-07, - "loss": 0.0858, - "num_tokens": 236514219.0, - "reward": 1.05810546875, - "reward_std": 0.2895505130290985, - "rewards/accuracy_reward/mean": 0.142578125, - "rewards/accuracy_reward/std": 0.3499840497970581, + "grad_norm": 0.2330407202243805, + "kl": 0.031951904296875, + "learning_rate": 9.990335902061015e-07, + "loss": 0.1538, + "num_tokens": 264757860.0, + "reward": 0.9970703125, + "reward_std": 0.32531172037124634, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.20348748564720154, + "rewards/tag_count_reward/mean": 0.8486328125, + "rewards/tag_count_reward/std": 0.27231717109680176, "step": 349 }, { @@ -10136,27 +10136,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.150390625, + "completions/clipped_ratio": 0.189453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1148.9453125, - "completions/mean_terminated_length": 989.8023071289062, - "completions/min_length": 233.0, - "completions/min_terminated_length": 233.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1082.99609375, + "completions/mean_terminated_length": 857.4409790039062, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.11948450968677989, - "grad_norm": 0.8284821510314941, - "kl": 0.0533447265625, - "learning_rate": 9.989628422471316e-07, - "loss": 0.1744, - "num_tokens": 237173295.0, - "reward": 0.93017578125, - "reward_std": 0.30755260586738586, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, + "grad_norm": 0.15811064839363098, + "kl": 0.03271484375, + "learning_rate": 9.989981417215755e-07, + "loss": 0.1847, + "num_tokens": 265383170.0, + "reward": 0.8896484375, + "reward_std": 0.3348539471626282, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87744140625, - "rewards/tag_count_reward/std": 0.247776597738266, + "rewards/tag_count_reward/mean": 0.8271484375, + "rewards/tag_count_reward/std": 0.281360387802124, "step": 350 }, { @@ -10165,27 +10165,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.134765625, + "completions/clipped_ratio": 0.146484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1150.068359375, - "completions/mean_terminated_length": 1010.2099609375, - "completions/min_length": 281.0, - "completions/min_terminated_length": 281.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1050.236328125, + "completions/mean_terminated_length": 878.995361328125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, "epoch": 0.1198258940001707, - "grad_norm": 343831.53125, - "kl": 2177.0279541015625, - "learning_rate": 9.989261461309232e-07, - "loss": 87.8897, - "num_tokens": 237841346.0, - "reward": 1.0849609375, - "reward_std": 0.2732813358306885, - "rewards/accuracy_reward/mean": 0.1875, - "rewards/accuracy_reward/std": 0.39070644974708557, + "grad_norm": 0.14287719130516052, + "kl": 0.027069091796875, + "learning_rate": 9.98962055482705e-07, + "loss": 0.1796, + "num_tokens": 266000107.0, + "reward": 1.01025390625, + "reward_std": 0.33847808837890625, + "rewards/accuracy_reward/mean": 0.1713709682226181, + "rewards/accuracy_reward/std": 0.3772132694721222, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9033203125, - "rewards/tag_count_reward/std": 0.22377894818782806, + "rewards/tag_count_reward/mean": 0.84423828125, + "rewards/tag_count_reward/std": 0.2631678879261017, "step": 351 }, { @@ -10194,27 +10194,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.13671875, + "completions/clipped_ratio": 0.14453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1095.2578125, - "completions/mean_terminated_length": 944.37109375, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1018.263671875, + "completions/mean_terminated_length": 844.2899169921875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, "epoch": 0.1201672783135615, - "grad_norm": 2.281174898147583, - "kl": 0.10009765625, - "learning_rate": 9.9888881284616e-07, - "loss": 0.1497, - "num_tokens": 238469238.0, - "reward": 0.9580078125, - "reward_std": 0.27261272072792053, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, + "grad_norm": 0.14097464084625244, + "kl": 0.0313720703125, + "learning_rate": 9.989253315407466e-07, + "loss": 0.1469, + "num_tokens": 266588578.0, + "reward": 0.9443359375, + "reward_std": 0.3343873620033264, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9013671875, - "rewards/tag_count_reward/std": 0.224017933011055, + "rewards/tag_count_reward/mean": 0.8583984375, + "rewards/tag_count_reward/std": 0.26118385791778564, "step": 352 }, { @@ -10223,27 +10223,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.15234375, + "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1179.5546875, - "completions/mean_terminated_length": 1023.4746704101562, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1103.642578125, + "completions/mean_terminated_length": 907.6438598632812, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, "epoch": 0.1205086626269523, - "grad_norm": 117.5628433227539, - "kl": 1.05743408203125, - "learning_rate": 9.9885084244583e-07, - "loss": 0.213, - "num_tokens": 239156018.0, - "reward": 0.9990234375, - "reward_std": 0.29136621952056885, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, + "grad_norm": 0.1444326490163803, + "kl": 0.031646728515625, + "learning_rate": 9.98887969947863e-07, + "loss": 0.199, + "num_tokens": 267236491.0, + "reward": 0.951171875, + "reward_std": 0.31488221883773804, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8935546875, - "rewards/tag_count_reward/std": 0.2328195720911026, + "rewards/tag_count_reward/mean": 0.837890625, + "rewards/tag_count_reward/std": 0.2655899226665497, "step": 353 }, { @@ -10252,27 +10252,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.189453125, + "completions/clipped_ratio": 0.17578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1276.390625, - "completions/mean_terminated_length": 1096.03857421875, - "completions/min_length": 201.0, - "completions/min_terminated_length": 201.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1112.1328125, + "completions/mean_terminated_length": 912.5403442382812, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, "epoch": 0.1208500469403431, - "grad_norm": 3.7248942852020264, - "kl": 0.09783935546875, - "learning_rate": 9.988122349838247e-07, - "loss": 0.1338, - "num_tokens": 239891034.0, - "reward": 0.9345703125, - "reward_std": 0.2773207426071167, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, + "grad_norm": 0.13784323632717133, + "kl": 0.030548095703125, + "learning_rate": 9.988499707571226e-07, + "loss": 0.2054, + "num_tokens": 267887407.0, + "reward": 0.93310546875, + "reward_std": 0.2967950701713562, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8525390625, - "rewards/tag_count_reward/std": 0.2726678252220154, + "rewards/tag_count_reward/mean": 0.84130859375, + "rewards/tag_count_reward/std": 0.26558586955070496, "step": 354 }, { @@ -10281,27 +10281,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.162109375, + "completions/clipped_ratio": 0.16796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 1155.509765625, - "completions/mean_terminated_length": 982.8368530273438, - "completions/min_length": 279.0, - "completions/min_terminated_length": 279.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 1056.72265625, + "completions/mean_terminated_length": 856.6056518554688, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.1211914312537339, - "grad_norm": 0.44104957580566406, - "kl": 0.0574951171875, - "learning_rate": 9.987729905149411e-07, - "loss": 0.1821, - "num_tokens": 240557519.0, - "reward": 1.03271484375, - "reward_std": 0.33100438117980957, - "rewards/accuracy_reward/mean": 0.15927419066429138, - "rewards/accuracy_reward/std": 0.3663010001182556, + "grad_norm": 0.1503535360097885, + "kl": 0.0325927734375, + "learning_rate": 9.988113340224986e-07, + "loss": 0.1873, + "num_tokens": 268503313.0, + "reward": 0.98681640625, + "reward_std": 0.3488824963569641, + "rewards/accuracy_reward/mean": 0.16129031777381897, + "rewards/accuracy_reward/std": 0.3681698739528656, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87841796875, - "rewards/tag_count_reward/std": 0.25168323516845703, + "rewards/tag_count_reward/mean": 0.83056640625, + "rewards/tag_count_reward/std": 0.2748923897743225, "step": 355 }, { @@ -10310,27 +10310,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.08984375, + "completions/clipped_ratio": 0.138671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 1106.876953125, - "completions/mean_terminated_length": 1013.9763793945312, - "completions/min_length": 223.0, - "completions/min_terminated_length": 223.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1055.072265625, + "completions/mean_terminated_length": 895.213134765625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, "epoch": 0.1215328155671247, - "grad_norm": 0.24413111805915833, - "kl": 0.0404052734375, - "learning_rate": 9.98733109094879e-07, - "loss": 0.1194, - "num_tokens": 241204832.0, - "reward": 1.041015625, - "reward_std": 0.22742190957069397, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, + "grad_norm": 0.1478099524974823, + "kl": 0.0308837890625, + "learning_rate": 9.98772059798871e-07, + "loss": 0.1792, + "num_tokens": 269124102.0, + "reward": 0.95947265625, + "reward_std": 0.28720688819885254, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.931640625, - "rewards/tag_count_reward/std": 0.1836380660533905, + "rewards/tag_count_reward/mean": 0.85986328125, + "rewards/tag_count_reward/std": 0.25367239117622375, "step": 356 }, { @@ -10339,27 +10339,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.16796875, + "completions/clipped_ratio": 0.138671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1181.310546875, - "completions/mean_terminated_length": 1006.3450927734375, - "completions/min_length": 286.0, - "completions/min_terminated_length": 286.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1029.591796875, + "completions/mean_terminated_length": 865.63037109375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.1218741998805155, - "grad_norm": 0.2896995544433594, - "kl": 0.04241943359375, - "learning_rate": 9.98692590780243e-07, - "loss": 0.1404, - "num_tokens": 241885727.0, - "reward": 1.01611328125, - "reward_std": 0.32733917236328125, - "rewards/accuracy_reward/mean": 0.14453125, - "rewards/accuracy_reward/std": 0.35197147727012634, + "grad_norm": 0.15571890771389008, + "kl": 0.029937744140625, + "learning_rate": 9.987321481420244e-07, + "loss": 0.1857, + "num_tokens": 269727317.0, + "reward": 1.0390625, + "reward_std": 0.3653073310852051, + "rewards/accuracy_reward/mean": 0.181640625, + "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87158203125, - "rewards/tag_count_reward/std": 0.2564968764781952, + "rewards/tag_count_reward/mean": 0.857421875, + "rewards/tag_count_reward/std": 0.25923943519592285, "step": 357 }, { @@ -10368,27 +10368,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.12890625, + "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1061.03125, - "completions/mean_terminated_length": 914.9776000976562, - "completions/min_length": 182.0, - "completions/min_terminated_length": 182.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 952.359375, + "completions/mean_terminated_length": 806.9203491210938, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, "epoch": 0.1222155841939063, - "grad_norm": 0.2189946472644806, - "kl": 0.04302978515625, - "learning_rate": 9.986514356285412e-07, - "loss": 0.129, - "num_tokens": 242519743.0, - "reward": 1.095703125, - "reward_std": 0.274413526058197, - "rewards/accuracy_reward/mean": 0.177734375, - "rewards/accuracy_reward/std": 0.3826628625392914, + "grad_norm": 0.28905969858169556, + "kl": 0.04150390625, + "learning_rate": 9.98691599108649e-07, + "loss": 0.1945, + "num_tokens": 270305693.0, + "reward": 1.068359375, + "reward_std": 0.30664169788360596, + "rewards/accuracy_reward/mean": 0.181640625, + "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.19562077522277832, + "rewards/tag_count_reward/mean": 0.88671875, + "rewards/tag_count_reward/std": 0.22956475615501404, "step": 358 }, { @@ -10397,27 +10397,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.169921875, + "completions/clipped_ratio": 0.189453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1221.43359375, - "completions/mean_terminated_length": 1052.2305908203125, - "completions/min_length": 275.0, - "completions/min_terminated_length": 275.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1121.091796875, + "completions/mean_terminated_length": 904.4409790039062, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.1225569685072971, - "grad_norm": 0.20168673992156982, - "kl": 0.04083251953125, - "learning_rate": 9.986096436981862e-07, - "loss": 0.1191, - "num_tokens": 243226989.0, - "reward": 0.99755859375, - "reward_std": 0.2901397943496704, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, + "grad_norm": 0.1388411968946457, + "kl": 0.02850341796875, + "learning_rate": 9.986504127563407e-07, + "loss": 0.2306, + "num_tokens": 270961564.0, + "reward": 0.93896484375, + "reward_std": 0.3266991376876831, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88818359375, - "rewards/tag_count_reward/std": 0.23107852041721344, + "rewards/tag_count_reward/mean": 0.83154296875, + "rewards/tag_count_reward/std": 0.2790221869945526, "step": 359 }, { @@ -10426,27 +10426,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.142578125, + "completions/clipped_ratio": 0.14453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1172.826171875, - "completions/mean_terminated_length": 1027.296142578125, - "completions/min_length": 247.0, - "completions/min_terminated_length": 247.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1020.287109375, + "completions/mean_terminated_length": 846.6552124023438, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.1228983528206879, - "grad_norm": 0.4305398166179657, - "kl": 0.046630859375, - "learning_rate": 9.985672150484937e-07, - "loss": 0.1307, - "num_tokens": 243910340.0, - "reward": 0.9375, - "reward_std": 0.27414533495903015, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, + "grad_norm": 0.1675242930650711, + "kl": 0.03277587890625, + "learning_rate": 9.986085891436e-07, + "loss": 0.1759, + "num_tokens": 271566815.0, + "reward": 0.93701171875, + "reward_std": 0.31937652826309204, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88671875, - "rewards/tag_count_reward/std": 0.23998405039310455, + "rewards/tag_count_reward/mean": 0.85302734375, + "rewards/tag_count_reward/std": 0.25984644889831543, "step": 360 }, { @@ -10455,27 +10455,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1640625, + "completions/clipped_ratio": 0.17578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 1167.765625, - "completions/mean_terminated_length": 995.0093383789062, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1083.35546875, + "completions/mean_terminated_length": 877.6256103515625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, "epoch": 0.1232397371340787, - "grad_norm": 0.24642539024353027, - "kl": 0.04925537109375, - "learning_rate": 9.985241497396835e-07, - "loss": 0.1211, - "num_tokens": 244590060.0, - "reward": 1.04150390625, - "reward_std": 0.3077473044395447, - "rewards/accuracy_reward/mean": 0.14453125, - "rewards/accuracy_reward/std": 0.35197147727012634, + "grad_norm": 0.1524626463651657, + "kl": 0.03472900390625, + "learning_rate": 9.985661283298332e-07, + "loss": 0.2022, + "num_tokens": 272203317.0, + "reward": 1.02197265625, + "reward_std": 0.3534594774246216, + "rewards/accuracy_reward/mean": 0.17578125, + "rewards/accuracy_reward/std": 0.3810062110424042, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89697265625, - "rewards/tag_count_reward/std": 0.2266613245010376, + "rewards/tag_count_reward/mean": 0.84619140625, + "rewards/tag_count_reward/std": 0.26662003993988037, "step": 361 }, { @@ -10484,27 +10484,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1015625, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1058.2890625, - "completions/mean_terminated_length": 946.4086303710938, - "completions/min_length": 229.0, - "completions/min_terminated_length": 229.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 968.298828125, + "completions/mean_terminated_length": 819.5400390625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, "epoch": 0.1235811214474695, - "grad_norm": 0.2917609214782715, - "kl": 0.0450439453125, - "learning_rate": 9.984804478328792e-07, - "loss": 0.1059, - "num_tokens": 245214256.0, - "reward": 1.029296875, - "reward_std": 0.23584429919719696, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, + "grad_norm": 0.15877166390419006, + "kl": 0.03656005859375, + "learning_rate": 9.985230303753514e-07, + "loss": 0.1785, + "num_tokens": 272781438.0, + "reward": 0.99658203125, + "reward_std": 0.2940343916416168, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.2070399820804596, + "rewards/tag_count_reward/mean": 0.87353515625, + "rewards/tag_count_reward/std": 0.2512158453464508, "step": 362 }, { @@ -10513,27 +10513,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.134765625, + "completions/clipped_ratio": 0.115234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 1088.248046875, - "completions/mean_terminated_length": 938.7607421875, - "completions/min_length": 209.0, - "completions/min_terminated_length": 209.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 936.013671875, + "completions/mean_terminated_length": 791.1854248046875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, "epoch": 0.1239225057608603, - "grad_norm": 0.21121852099895477, - "kl": 0.0457763671875, - "learning_rate": 9.98436109390107e-07, - "loss": 0.127, - "num_tokens": 245853839.0, - "reward": 1.068359375, - "reward_std": 0.3218876123428345, - "rewards/accuracy_reward/mean": 0.173828125, - "rewards/accuracy_reward/std": 0.3793322443962097, + "grad_norm": 0.1602502167224884, + "kl": 0.03466796875, + "learning_rate": 9.984792953413704e-07, + "loss": 0.1606, + "num_tokens": 273343077.0, + "reward": 1.0615234375, + "reward_std": 0.372483491897583, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39069411158561707, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89453125, - "rewards/tag_count_reward/std": 0.23168610036373138, + "rewards/tag_count_reward/mean": 0.8740234375, + "rewards/tag_count_reward/std": 0.24975334107875824, "step": 363 }, { @@ -10542,27 +10542,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.224609375, + "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 1301.646484375, - "completions/mean_terminated_length": 1085.4483642578125, - "completions/min_length": 289.0, - "completions/min_terminated_length": 289.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1189.833984375, + "completions/mean_terminated_length": 949.5474853515625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.1242638900742511, - "grad_norm": 0.4051723778247833, - "kl": 0.0487060546875, - "learning_rate": 9.983911344742979e-07, - "loss": 0.1438, - "num_tokens": 246587770.0, - "reward": 0.97314453125, - "reward_std": 0.3036273121833801, - "rewards/accuracy_reward/mean": 0.134765625, - "rewards/accuracy_reward/std": 0.3418070077896118, + "grad_norm": 0.13621152937412262, + "kl": 0.032318115234375, + "learning_rate": 9.984349232900116e-07, + "loss": 0.2168, + "num_tokens": 274019760.0, + "reward": 0.94775390625, + "reward_std": 0.35309305787086487, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.83837890625, - "rewards/tag_count_reward/std": 0.2800033390522003, + "rewards/tag_count_reward/mean": 0.79931640625, + "rewards/tag_count_reward/std": 0.2977977395057678, "step": 364 }, { @@ -10571,27 +10571,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.140625, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 1169.8203125, - "completions/mean_terminated_length": 1026.1181640625, - "completions/min_length": 269.0, - "completions/min_terminated_length": 269.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 993.0703125, + "completions/mean_terminated_length": 847.7244873046875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.12460527438764189, - "grad_norm": 1.240547776222229, - "kl": 0.070068359375, - "learning_rate": 9.983455231492852e-07, - "loss": 0.1581, - "num_tokens": 247262510.0, - "reward": 1.03173828125, - "reward_std": 0.31879186630249023, - "rewards/accuracy_reward/mean": 0.140625, - "rewards/accuracy_reward/std": 0.3479743003845215, + "grad_norm": 0.150365948677063, + "kl": 0.031402587890625, + "learning_rate": 9.983899142843003e-07, + "loss": 0.17, + "num_tokens": 274604004.0, + "reward": 1.044921875, + "reward_std": 0.33601388335227966, + "rewards/accuracy_reward/mean": 0.162109375, + "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89111328125, - "rewards/tag_count_reward/std": 0.2392178475856781, + "rewards/tag_count_reward/mean": 0.8828125, + "rewards/tag_count_reward/std": 0.23758302628993988, "step": 365 }, { @@ -10600,27 +10600,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.15625, + "completions/clipped_ratio": 0.142578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1166.392578125, - "completions/mean_terminated_length": 1003.1319580078125, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1029.462890625, + "completions/mean_terminated_length": 860.0934448242188, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, "epoch": 0.12494665870103269, - "grad_norm": 0.8436083793640137, - "kl": 0.073974609375, - "learning_rate": 9.982992754798057e-07, - "loss": 0.1818, - "num_tokens": 247936503.0, - "reward": 1.0615234375, - "reward_std": 0.37862253189086914, - "rewards/accuracy_reward/mean": 0.18359375, - "rewards/accuracy_reward/std": 0.3875311613082886, + "grad_norm": 0.1535954773426056, + "kl": 0.0340576171875, + "learning_rate": 9.983442683881674e-07, + "loss": 0.1953, + "num_tokens": 275207889.0, + "reward": 1.0068359375, + "reward_std": 0.3589969575405121, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8779296875, - "rewards/tag_count_reward/std": 0.24924781918525696, + "rewards/tag_count_reward/mean": 0.8544921875, + "rewards/tag_count_reward/std": 0.2646293640136719, "step": 366 }, { @@ -10629,27 +10629,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.173828125, + "completions/clipped_ratio": 0.22265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 1193.67578125, - "completions/mean_terminated_length": 1013.9243774414062, - "completions/min_length": 231.0, - "completions/min_terminated_length": 231.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1118.158203125, + "completions/mean_terminated_length": 851.8215942382812, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, "epoch": 0.12528804301442348, - "grad_norm": 378.8311462402344, - "kl": 3.0299072265625, - "learning_rate": 9.982523915315e-07, - "loss": 0.2734, - "num_tokens": 248625025.0, - "reward": 0.99462890625, - "reward_std": 0.28291645646095276, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, + "grad_norm": 0.15709184110164642, + "kl": 0.03350830078125, + "learning_rate": 9.98297985666448e-07, + "loss": 0.2782, + "num_tokens": 275857746.0, + "reward": 0.93017578125, + "reward_std": 0.32532137632369995, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88330078125, - "rewards/tag_count_reward/std": 0.24516746401786804, + "rewards/tag_count_reward/mean": 0.81298828125, + "rewards/tag_count_reward/std": 0.29354146122932434, "step": 367 }, { @@ -10658,27 +10658,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.150390625, + "completions/clipped_ratio": 0.158203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1200.595703125, - "completions/mean_terminated_length": 1050.595458984375, - "completions/min_length": 257.0, - "completions/min_terminated_length": 257.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1093.9609375, + "completions/mean_terminated_length": 914.6635131835938, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.1256294273278143, - "grad_norm": 4.661107540130615, - "kl": 0.2315673828125, - "learning_rate": 9.982048713709109e-07, - "loss": 0.1408, - "num_tokens": 249312050.0, - "reward": 1.10302734375, - "reward_std": 0.33130943775177, - "rewards/accuracy_reward/mean": 0.208984375, - "rewards/accuracy_reward/std": 0.40698084235191345, + "grad_norm": 0.13502192497253418, + "kl": 0.033782958984375, + "learning_rate": 9.982510661848819e-07, + "loss": 0.1753, + "num_tokens": 276490174.0, + "reward": 1.0791015625, + "reward_std": 0.38882726430892944, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.42882615327835083, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89404296875, - "rewards/tag_count_reward/std": 0.23066876828670502, + "rewards/tag_count_reward/mean": 0.8369140625, + "rewards/tag_count_reward/std": 0.272723913192749, "step": 368 }, { @@ -10687,27 +10687,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.14453125, + "completions/clipped_ratio": 0.111328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1161.1875, - "completions/mean_terminated_length": 1011.3607177734375, - "completions/min_length": 68.0, - "completions/min_terminated_length": 68.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 976.408203125, + "completions/mean_terminated_length": 842.1648559570312, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, "epoch": 0.12597081164120508, - "grad_norm": 42.994876861572266, - "kl": 1.1650390625, - "learning_rate": 9.981567150654848e-07, - "loss": 0.1515, - "num_tokens": 249979122.0, - "reward": 1.07666015625, - "reward_std": 0.32884156703948975, - "rewards/accuracy_reward/mean": 0.169921875, - "rewards/accuracy_reward/std": 0.3759314715862274, + "grad_norm": 0.1473228484392166, + "kl": 0.033416748046875, + "learning_rate": 9.98203510010113e-07, + "loss": 0.1656, + "num_tokens": 277062639.0, + "reward": 1.07421875, + "reward_std": 0.32450687885284424, + "rewards/accuracy_reward/mean": 0.177734375, + "rewards/accuracy_reward/std": 0.3826628625392914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.2254990190267563, + "rewards/tag_count_reward/mean": 0.896484375, + "rewards/tag_count_reward/std": 0.23256702721118927, "step": 369 }, { @@ -10716,27 +10716,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.169921875, + "completions/clipped_ratio": 0.23046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1240.078125, - "completions/mean_terminated_length": 1074.6917724609375, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/mean_length": 1193.373046875, + "completions/mean_terminated_length": 937.4187622070312, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.1263121959545959, - "grad_norm": 2410563.0, - "kl": 15131.19091796875, - "learning_rate": 9.981079226835704e-07, - "loss": 605.8589, - "num_tokens": 250689994.0, - "reward": 1.02685546875, - "reward_std": 0.35881370306015015, + "grad_norm": 0.1366780698299408, + "kl": 0.031280517578125, + "learning_rate": 9.981553172096898e-07, + "loss": 0.1669, + "num_tokens": 277749598.0, + "reward": 0.96044921875, + "reward_std": 0.35311082005500793, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87646484375, - "rewards/tag_count_reward/std": 0.2487695813179016, + "rewards/tag_count_reward/mean": 0.81005859375, + "rewards/tag_count_reward/std": 0.299512654542923, "step": 370 }, { @@ -10745,27 +10745,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.115234375, + "completions/clipped_ratio": 0.216796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 1174.806640625, - "completions/mean_terminated_length": 1061.0794677734375, - "completions/min_length": 224.0, - "completions/min_terminated_length": 224.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1137.755859375, + "completions/mean_terminated_length": 885.7930297851562, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.12665358026798668, - "grad_norm": 87.1528091430664, - "kl": 0.883056640625, - "learning_rate": 9.9805849429442e-07, - "loss": 0.1969, - "num_tokens": 251368151.0, - "reward": 1.0009765625, - "reward_std": 0.2787174582481384, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, + "grad_norm": 0.14127393066883087, + "kl": 0.035888671875, + "learning_rate": 9.981064878520655e-07, + "loss": 0.2243, + "num_tokens": 278408785.0, + "reward": 0.91845703125, + "reward_std": 0.33893805742263794, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9072265625, - "rewards/tag_count_reward/std": 0.22324882447719574, + "rewards/tag_count_reward/mean": 0.81494140625, + "rewards/tag_count_reward/std": 0.2968464195728302, "step": 371 }, { @@ -10774,27 +10774,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.138671875, + "completions/clipped_ratio": 0.150390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 1187.529296875, - "completions/mean_terminated_length": 1048.9954833984375, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1069.248046875, + "completions/mean_terminated_length": 895.9976806640625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.1269949645813775, - "grad_norm": 0.4278341233730316, - "kl": 0.08721923828125, - "learning_rate": 9.980084299681879e-07, - "loss": 0.1303, - "num_tokens": 252067190.0, - "reward": 0.96435546875, - "reward_std": 0.2680964469909668, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, + "grad_norm": 0.1410568654537201, + "kl": 0.03564453125, + "learning_rate": 9.980570220065969e-07, + "loss": 0.18, + "num_tokens": 279047264.0, + "reward": 0.9501953125, + "reward_std": 0.2930806279182434, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89208984375, - "rewards/tag_count_reward/std": 0.23240621387958527, + "rewards/tag_count_reward/mean": 0.8583984375, + "rewards/tag_count_reward/std": 0.26211875677108765, "step": 372 }, { @@ -10803,27 +10803,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.12109375, + "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1125.94140625, - "completions/mean_terminated_length": 998.9022216796875, - "completions/min_length": 229.0, - "completions/min_terminated_length": 229.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 997.392578125, + "completions/mean_terminated_length": 868.37060546875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.12733634889476828, - "grad_norm": 0.4684431254863739, - "kl": 0.0633544921875, - "learning_rate": 9.979577297759312e-07, - "loss": 0.1117, - "num_tokens": 252718296.0, - "reward": 1.10400390625, - "reward_std": 0.316215455532074, - "rewards/accuracy_reward/mean": 0.19921875, - "rewards/accuracy_reward/std": 0.39980348944664, + "grad_norm": 0.14614659547805786, + "kl": 0.030242919921875, + "learning_rate": 9.980069197435444e-07, + "loss": 0.1732, + "num_tokens": 279632553.0, + "reward": 1.1259765625, + "reward_std": 0.35918954014778137, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4190165400505066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90478515625, - "rewards/tag_count_reward/std": 0.21860963106155396, + "rewards/tag_count_reward/mean": 0.8994140625, + "rewards/tag_count_reward/std": 0.22094275057315826, "step": 373 }, { @@ -10832,27 +10832,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.173828125, + "completions/clipped_ratio": 0.1328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1212.505859375, - "completions/mean_terminated_length": 1036.71630859375, - "completions/min_length": 100.0, - "completions/min_terminated_length": 100.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1083.3984375, + "completions/mean_terminated_length": 935.6666870117188, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, "epoch": 0.1276777332081591, - "grad_norm": 0.7251440286636353, - "kl": 0.06719970703125, - "learning_rate": 9.979063937896098e-07, - "loss": 0.1669, - "num_tokens": 253413307.0, - "reward": 1.015625, - "reward_std": 0.32031503319740295, - "rewards/accuracy_reward/mean": 0.134765625, - "rewards/accuracy_reward/std": 0.3418070077896118, + "grad_norm": 0.16419485211372375, + "kl": 0.03302001953125, + "learning_rate": 9.979561811340737e-07, + "loss": 0.1638, + "num_tokens": 280261461.0, + "reward": 1.03173828125, + "reward_std": 0.3253314793109894, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.880859375, - "rewards/tag_count_reward/std": 0.24771922826766968, + "rewards/tag_count_reward/mean": 0.87158203125, + "rewards/tag_count_reward/std": 0.24776503443717957, "step": 374 }, { @@ -10861,27 +10861,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1171875, + "completions/clipped_ratio": 0.13671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 1116.748046875, - "completions/mean_terminated_length": 993.1305541992188, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1004.001953125, + "completions/mean_terminated_length": 838.6629028320312, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, "epoch": 0.12801911752154987, - "grad_norm": 0.4610971510410309, - "kl": 0.05511474609375, - "learning_rate": 9.978544220820858e-07, - "loss": 0.0899, - "num_tokens": 254067322.0, - "reward": 1.0341796875, - "reward_std": 0.27670881152153015, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310528099536896, + "grad_norm": 0.16056229174137115, + "kl": 0.0382080078125, + "learning_rate": 9.979048062502532e-07, + "loss": 0.1912, + "num_tokens": 280857750.0, + "reward": 1.0078125, + "reward_std": 0.3076658844947815, + "rewards/accuracy_reward/mean": 0.13104838132858276, + "rewards/accuracy_reward/std": 0.3377939462661743, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9130859375, - "rewards/tag_count_reward/std": 0.20515529811382294, + "rewards/tag_count_reward/mean": 0.880859375, + "rewards/tag_count_reward/std": 0.24573633074760437, "step": 375 }, { @@ -10890,27 +10890,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.17578125, + "completions/clipped_ratio": 0.1484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1296.08203125, - "completions/mean_terminated_length": 1135.720458984375, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1140.400390625, + "completions/mean_terminated_length": 982.1948852539062, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.1283605018349407, - "grad_norm": 0.5585299730300903, - "kl": 0.0692138671875, - "learning_rate": 9.978018147271232e-07, - "loss": 0.1125, - "num_tokens": 254811396.0, - "reward": 0.9990234375, - "reward_std": 0.2815853953361511, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, + "grad_norm": 0.15369459986686707, + "kl": 0.03521728515625, + "learning_rate": 9.978527951650558e-07, + "loss": 0.1735, + "num_tokens": 281522115.0, + "reward": 0.96533203125, + "reward_std": 0.3165265917778015, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8916015625, - "rewards/tag_count_reward/std": 0.22872866690158844, + "rewards/tag_count_reward/mean": 0.86376953125, + "rewards/tag_count_reward/std": 0.2605331838130951, "step": 376 }, { @@ -10919,27 +10919,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.142578125, + "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 1152.908203125, - "completions/mean_terminated_length": 1004.0661010742188, - "completions/min_length": 245.0, - "completions/min_terminated_length": 245.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 1010.84765625, + "completions/mean_terminated_length": 886.0262451171875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.12870188614833147, - "grad_norm": 0.8666077256202698, - "kl": 0.1131591796875, - "learning_rate": 9.977485717993885e-07, - "loss": 0.1211, - "num_tokens": 255469333.0, - "reward": 1.01318359375, - "reward_std": 0.22732698917388916, - "rewards/accuracy_reward/mean": 0.11895161122083664, - "rewards/accuracy_reward/std": 0.3240584135055542, + "grad_norm": 0.12651118636131287, + "kl": 0.031890869140625, + "learning_rate": 9.978001479523573e-07, + "loss": 0.1193, + "num_tokens": 282107317.0, + "reward": 1.01416015625, + "reward_std": 0.28059664368629456, + "rewards/accuracy_reward/mean": 0.13104838132858276, + "rewards/accuracy_reward/std": 0.3377939462661743, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89794921875, - "rewards/tag_count_reward/std": 0.22548207640647888, + "rewards/tag_count_reward/mean": 0.88720703125, + "rewards/tag_count_reward/std": 0.23480726778507233, "step": 377 }, { @@ -10948,27 +10948,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1640625, + "completions/clipped_ratio": 0.158203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1194.314453125, - "completions/mean_terminated_length": 1026.7686767578125, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1064.865234375, + "completions/mean_terminated_length": 880.0997314453125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, "epoch": 0.1290432704617223, - "grad_norm": 3.573004961013794, - "kl": 0.22021484375, - "learning_rate": 9.976946933744505e-07, - "loss": 0.1275, - "num_tokens": 256150294.0, - "reward": 1.00341796875, - "reward_std": 0.28133586049079895, + "grad_norm": 0.1738119125366211, + "kl": 0.03253173828125, + "learning_rate": 9.97746864686938e-07, + "loss": 0.2181, + "num_tokens": 282722000.0, + "reward": 0.9736328125, + "reward_std": 0.34592851996421814, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89794921875, - "rewards/tag_count_reward/std": 0.21943418681621552, + "rewards/tag_count_reward/mean": 0.8681640625, + "rewards/tag_count_reward/std": 0.2625558376312256, "step": 378 }, { @@ -10977,27 +10977,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.203125, + "completions/clipped_ratio": 0.1484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 1269.986328125, - "completions/mean_terminated_length": 1071.669189453125, - "completions/min_length": 226.0, - "completions/min_terminated_length": 226.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 1055.5625, + "completions/mean_terminated_length": 882.5687866210938, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, "epoch": 0.12938465477511307, - "grad_norm": 5.827714443206787, - "kl": 0.137451171875, - "learning_rate": 9.9764017952878e-07, - "loss": 0.1455, - "num_tokens": 256875423.0, - "reward": 0.95947265625, - "reward_std": 0.28445520997047424, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, + "grad_norm": 0.14871911704540253, + "kl": 0.032745361328125, + "learning_rate": 9.976929454444809e-07, + "loss": 0.1825, + "num_tokens": 283337344.0, + "reward": 0.95849609375, + "reward_std": 0.29456979036331177, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.86572265625, - "rewards/tag_count_reward/std": 0.2500721216201782, + "rewards/tag_count_reward/mean": 0.87060546875, + "rewards/tag_count_reward/std": 0.2540864050388336, "step": 379 }, { @@ -11006,27 +11006,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.166015625, + "completions/clipped_ratio": 0.130859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 1251.505859375, - "completions/mean_terminated_length": 1092.953125, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1074.015625, + "completions/mean_terminated_length": 927.3707885742188, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, "epoch": 0.12972603908850389, - "grad_norm": 0.39755210280418396, - "kl": 0.0858154296875, - "learning_rate": 9.97585030339749e-07, - "loss": 0.1089, - "num_tokens": 257591778.0, - "reward": 1.0390625, - "reward_std": 0.2732459306716919, - "rewards/accuracy_reward/mean": 0.13671875, - "rewards/accuracy_reward/std": 0.3438861668109894, + "grad_norm": 0.13046395778656006, + "kl": 0.031890869140625, + "learning_rate": 9.976383903015724e-07, + "loss": 0.1671, + "num_tokens": 283962824.0, + "reward": 1.03076171875, + "reward_std": 0.311119943857193, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90234375, - "rewards/tag_count_reward/std": 0.21269488334655762, + "rewards/tag_count_reward/mean": 0.89013671875, + "rewards/tag_count_reward/std": 0.24131768941879272, "step": 380 }, { @@ -11035,27 +11035,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1953125, + "completions/clipped_ratio": 0.15234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 1232.404296875, - "completions/mean_terminated_length": 1034.4442138671875, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1075.09765625, + "completions/mean_terminated_length": 900.2442626953125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, "epoch": 0.13006742340189467, - "grad_norm": 0.8170649409294128, - "kl": 0.09619140625, - "learning_rate": 9.97529245885632e-07, - "loss": 0.1599, - "num_tokens": 258300513.0, - "reward": 0.93017578125, - "reward_std": 0.27194511890411377, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, + "grad_norm": 0.14830279350280762, + "kl": 0.031829833984375, + "learning_rate": 9.975831993357026e-07, + "loss": 0.2256, + "num_tokens": 284591018.0, + "reward": 0.94677734375, + "reward_std": 0.27035853266716003, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.85595703125, - "rewards/tag_count_reward/std": 0.2642766237258911, + "rewards/tag_count_reward/mean": 0.87060546875, + "rewards/tag_count_reward/std": 0.25360459089279175, "step": 381 }, { @@ -11064,27 +11064,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.21875, + "completions/clipped_ratio": 0.10546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 1328.248046875, - "completions/mean_terminated_length": 1126.717529296875, - "completions/min_length": 261.0, - "completions/min_terminated_length": 261.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1038.443359375, + "completions/mean_terminated_length": 919.4126586914062, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.13040880771528549, - "grad_norm": 2.283935785293579, - "kl": 0.16015625, - "learning_rate": 9.974728262456043e-07, - "loss": 0.1189, - "num_tokens": 259054352.0, - "reward": 0.9736328125, - "reward_std": 0.27440378069877625, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, + "grad_norm": 0.15144680440425873, + "kl": 0.0301513671875, + "learning_rate": 9.975273726252644e-07, + "loss": 0.167, + "num_tokens": 285196477.0, + "reward": 1.0185546875, + "reward_std": 0.2786235809326172, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8798828125, - "rewards/tag_count_reward/std": 0.23087425529956818, + "rewards/tag_count_reward/mean": 0.9033203125, + "rewards/tag_count_reward/std": 0.21141289174556732, "step": 382 }, { @@ -11093,27 +11093,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.19921875, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 1287.685546875, - "completions/mean_terminated_length": 1098.5341796875, - "completions/min_length": 282.0, - "completions/min_terminated_length": 282.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 995.138671875, + "completions/mean_terminated_length": 910.7319946289062, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, "epoch": 0.13075019202867627, - "grad_norm": 0.4480745792388916, - "kl": 0.095703125, - "learning_rate": 9.974157714997438e-07, - "loss": 0.1528, - "num_tokens": 259792623.0, - "reward": 1.0263671875, - "reward_std": 0.3242665231227875, - "rewards/accuracy_reward/mean": 0.1484375, - "rewards/accuracy_reward/std": 0.35588082671165466, + "grad_norm": 0.14347217977046967, + "kl": 0.028839111328125, + "learning_rate": 9.974709102495536e-07, + "loss": 0.1066, + "num_tokens": 285784964.0, + "reward": 1.05908203125, + "reward_std": 0.3068065345287323, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8779296875, - "rewards/tag_count_reward/std": 0.23562633991241455, + "rewards/tag_count_reward/mean": 0.92041015625, + "rewards/tag_count_reward/std": 0.19569343328475952, "step": 383 }, { @@ -11122,27 +11122,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.21875, + "completions/clipped_ratio": 0.14453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 1343.193359375, - "completions/mean_terminated_length": 1145.8475341796875, - "completions/min_length": 233.0, - "completions/min_terminated_length": 233.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1112.611328125, + "completions/mean_terminated_length": 954.5775756835938, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.13109157634206708, - "grad_norm": 0.9114007353782654, - "kl": 0.111328125, - "learning_rate": 9.973580817290295e-07, - "loss": 0.1672, - "num_tokens": 260552306.0, - "reward": 0.958984375, - "reward_std": 0.29250961542129517, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, + "grad_norm": 0.13277432322502136, + "kl": 0.031402587890625, + "learning_rate": 9.974138122887689e-07, + "loss": 0.1369, + "num_tokens": 286426589.0, + "reward": 0.966796875, + "reward_std": 0.3141400218009949, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87109375, - "rewards/tag_count_reward/std": 0.24775780737400055, + "rewards/tag_count_reward/mean": 0.869140625, + "rewards/tag_count_reward/std": 0.25017574429512024, "step": 384 }, { @@ -11151,27 +11151,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.181640625, + "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1200.03515625, - "completions/mean_terminated_length": 1011.8234252929688, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/mean_length": 989.17578125, + "completions/mean_terminated_length": 859.1447143554688, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, "epoch": 0.13143296065545787, - "grad_norm": 1.0169259309768677, - "kl": 0.092041015625, - "learning_rate": 9.972997570153406e-07, - "loss": 0.1167, - "num_tokens": 261251252.0, - "reward": 0.95068359375, - "reward_std": 0.26185333728790283, - "rewards/accuracy_reward/mean": 0.07258064299821854, - "rewards/accuracy_reward/std": 0.25970885157585144, + "grad_norm": 0.1668711006641388, + "kl": 0.03533935546875, + "learning_rate": 9.973560788240122e-07, + "loss": 0.1547, + "num_tokens": 287017575.0, + "reward": 1.021484375, + "reward_std": 0.3046708106994629, + "rewards/accuracy_reward/mean": 0.12298387289047241, + "rewards/accuracy_reward/std": 0.32875028252601624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88037109375, - "rewards/tag_count_reward/std": 0.238677978515625, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.22280539572238922, "step": 385 }, { @@ -11180,27 +11180,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.205078125, + "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1313.078125, - "completions/mean_terminated_length": 1123.4791259765625, - "completions/min_length": 221.0, - "completions/min_terminated_length": 221.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1106.5078125, + "completions/mean_terminated_length": 932.1574096679688, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.13177434496884868, - "grad_norm": 0.7724472284317017, - "kl": 0.095458984375, - "learning_rate": 9.972407974414592e-07, - "loss": 0.1242, - "num_tokens": 261994236.0, - "reward": 1.001953125, - "reward_std": 0.3082892596721649, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, + "grad_norm": 0.15250162780284882, + "kl": 0.0341796875, + "learning_rate": 9.972977099372877e-07, + "loss": 0.1607, + "num_tokens": 287654795.0, + "reward": 0.99755859375, + "reward_std": 0.30660396814346313, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87890625, - "rewards/tag_count_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.88818359375, + "rewards/tag_count_reward/std": 0.23371002078056335, "step": 386 }, { @@ -11209,27 +11209,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.171875, + "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 1262.189453125, - "completions/mean_terminated_length": 1099.0966796875, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1047.12109375, + "completions/mean_terminated_length": 914.2610473632812, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, "epoch": 0.13211572928223947, - "grad_norm": 0.38649216294288635, - "kl": 0.1070556640625, - "learning_rate": 9.971812030910671e-07, - "loss": 0.1158, - "num_tokens": 262722397.0, - "reward": 0.90771484375, - "reward_std": 0.21668007969856262, - "rewards/accuracy_reward/mean": 0.021484375, - "rewards/accuracy_reward/std": 0.14513419568538666, + "grad_norm": 0.15170042216777802, + "kl": 0.03753662109375, + "learning_rate": 9.972387057115022e-07, + "loss": 0.2021, + "num_tokens": 288272841.0, + "reward": 0.9462890625, + "reward_std": 0.2462095320224762, + "rewards/accuracy_reward/mean": 0.041015625, + "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88623046875, - "rewards/tag_count_reward/std": 0.23589536547660828, + "rewards/tag_count_reward/mean": 0.9052734375, + "rewards/tag_count_reward/std": 0.226781964302063, "step": 387 }, { @@ -11238,27 +11238,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.15234375, + "completions/clipped_ratio": 0.177734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 1263.587890625, - "completions/mean_terminated_length": 1122.610595703125, - "completions/min_length": 257.0, - "completions/min_terminated_length": 257.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1152.87109375, + "completions/mean_terminated_length": 959.38720703125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, "epoch": 0.13245711359563028, - "grad_norm": 66.00189208984375, - "kl": 0.6612548828125, - "learning_rate": 9.971209740487478e-07, - "loss": 0.1217, - "num_tokens": 263458058.0, - "reward": 0.98486328125, - "reward_std": 0.25900518894195557, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, + "grad_norm": 0.12724484503269196, + "kl": 0.030914306640625, + "learning_rate": 9.97179066230465e-07, + "loss": 0.1602, + "num_tokens": 288951815.0, + "reward": 0.93994140625, + "reward_std": 0.3069894313812256, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89501953125, - "rewards/tag_count_reward/std": 0.23111572861671448, + "rewards/tag_count_reward/mean": 0.86181640625, + "rewards/tag_count_reward/std": 0.26967042684555054, "step": 388 }, { @@ -11267,27 +11267,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.166015625, + "completions/clipped_ratio": 0.13671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1226.498046875, - "completions/mean_terminated_length": 1062.9671630859375, - "completions/min_length": 299.0, - "completions/min_terminated_length": 299.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1052.642578125, + "completions/mean_terminated_length": 895.0068359375, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.13279849790902107, - "grad_norm": 0.5843941569328308, - "kl": 0.1075439453125, - "learning_rate": 9.970601103999854e-07, - "loss": 0.1177, - "num_tokens": 264166121.0, - "reward": 0.9853515625, - "reward_std": 0.22699961066246033, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, + "grad_norm": 0.14453551173210144, + "kl": 0.03594970703125, + "learning_rate": 9.971187915788875e-07, + "loss": 0.209, + "num_tokens": 289570864.0, + "reward": 0.98828125, + "reward_std": 0.2822495996952057, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9013671875, - "rewards/tag_count_reward/std": 0.21339233219623566, + "rewards/tag_count_reward/mean": 0.876953125, + "rewards/tag_count_reward/std": 0.2443011850118637, "step": 389 }, { @@ -11296,27 +11296,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.150390625, + "completions/clipped_ratio": 0.142578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1167.7578125, - "completions/mean_terminated_length": 1011.94482421875, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1055.484375, + "completions/mean_terminated_length": 890.4419555664062, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, "epoch": 0.13313988222241188, - "grad_norm": 1.1302576065063477, - "kl": 0.103515625, - "learning_rate": 9.969986122311648e-07, - "loss": 0.1136, - "num_tokens": 264841197.0, - "reward": 1.03173828125, - "reward_std": 0.2919134795665741, - "rewards/accuracy_reward/mean": 0.140625, - "rewards/accuracy_reward/std": 0.3479743003845215, + "grad_norm": 0.14505472779273987, + "kl": 0.033782958984375, + "learning_rate": 9.970578818423837e-07, + "loss": 0.1256, + "num_tokens": 290188456.0, + "reward": 1.01611328125, + "reward_std": 0.2951487898826599, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89111328125, - "rewards/tag_count_reward/std": 0.22280485928058624, + "rewards/tag_count_reward/mean": 0.86767578125, + "rewards/tag_count_reward/std": 0.26020291447639465, "step": 390 }, { @@ -11325,27 +11325,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.095703125, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1157.86328125, - "completions/mean_terminated_length": 1063.65869140625, - "completions/min_length": 202.0, - "completions/min_terminated_length": 202.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 953.166015625, + "completions/mean_terminated_length": 857.862060546875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, "epoch": 0.13348126653580267, - "grad_norm": 3.2016637325286865, - "kl": 0.1668701171875, - "learning_rate": 9.969364796295712e-07, - "loss": 0.1033, - "num_tokens": 265512919.0, - "reward": 1.111328125, - "reward_std": 0.3207700550556183, - "rewards/accuracy_reward/mean": 0.17578125, - "rewards/accuracy_reward/std": 0.3810062110424042, + "grad_norm": 0.1563599556684494, + "kl": 0.03515625, + "learning_rate": 9.969963371074693e-07, + "loss": 0.1462, + "num_tokens": 290755373.0, + "reward": 1.06884765625, + "reward_std": 0.3398585915565491, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.17060412466526031, + "rewards/tag_count_reward/mean": 0.91064453125, + "rewards/tag_count_reward/std": 0.2137608826160431, "step": 391 }, { @@ -11354,27 +11354,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.126953125, + "completions/clipped_ratio": 0.10546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1230.892578125, - "completions/mean_terminated_length": 1112.0738525390625, - "completions/min_length": 217.0, - "completions/min_terminated_length": 217.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1031.66796875, + "completions/mean_terminated_length": 911.8384399414062, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.13382265084919348, - "grad_norm": 2.2589118480682373, - "kl": 0.163330078125, - "learning_rate": 9.968737126833905e-07, - "loss": 0.1171, - "num_tokens": 266216512.0, - "reward": 1.00244140625, - "reward_std": 0.250763475894928, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, + "grad_norm": 0.16045336425304413, + "kl": 0.032470703125, + "learning_rate": 9.96934157461562e-07, + "loss": 0.1425, + "num_tokens": 291356963.0, + "reward": 0.9990234375, + "reward_std": 0.28106868267059326, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.19151799380779266, + "rewards/tag_count_reward/mean": 0.8994140625, + "rewards/tag_count_reward/std": 0.2258700281381607, "step": 392 }, { @@ -11383,27 +11383,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.126953125, + "completions/clipped_ratio": 0.134765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1169.677734375, - "completions/mean_terminated_length": 1041.95751953125, - "completions/min_length": 286.0, - "completions/min_terminated_length": 286.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 1034.720703125, + "completions/mean_terminated_length": 876.8961791992188, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, "epoch": 0.13416403516258427, - "grad_norm": 0.7035832405090332, - "kl": 0.1094970703125, - "learning_rate": 9.96810311481709e-07, - "loss": 0.1273, - "num_tokens": 266890811.0, - "reward": 1.08056640625, - "reward_std": 0.2758799195289612, - "rewards/accuracy_reward/mean": 0.162109375, - "rewards/accuracy_reward/std": 0.3689115643501282, + "grad_norm": 4.809753894805908, + "kl": 0.1025390625, + "learning_rate": 9.968713429929818e-07, + "loss": 0.1725, + "num_tokens": 291962164.0, + "reward": 1.06787109375, + "reward_std": 0.3107529878616333, + "rewards/accuracy_reward/mean": 0.181640625, + "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.19799919426441193, + "rewards/tag_count_reward/mean": 0.88623046875, + "rewards/tag_count_reward/std": 0.24554912745952606, "step": 393 }, { @@ -11412,27 +11412,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.12109375, + "completions/clipped_ratio": 0.13671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1150.82421875, - "completions/mean_terminated_length": 1027.21337890625, - "completions/min_length": 228.0, - "completions/min_terminated_length": 228.0, + "completions/mean_length": 1014.494140625, + "completions/mean_terminated_length": 850.8167724609375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, "epoch": 0.13450541947597508, - "grad_norm": 3.396650552749634, - "kl": 0.1527099609375, - "learning_rate": 9.967462761145132e-07, - "loss": 0.1233, - "num_tokens": 267549761.0, - "reward": 0.966796875, - "reward_std": 0.21518467366695404, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, + "grad_norm": 0.15697811543941498, + "kl": 0.0338134765625, + "learning_rate": 9.968078937909493e-07, + "loss": 0.2401, + "num_tokens": 292551313.0, + "reward": 0.96142578125, + "reward_std": 0.25520914793014526, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.1921909898519516, + "rewards/tag_count_reward/mean": 0.88916015625, + "rewards/tag_count_reward/std": 0.24238838255405426, "step": 394 }, { @@ -11441,27 +11441,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.103515625, + "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1150.5234375, - "completions/mean_terminated_length": 1046.8931884765625, - "completions/min_length": 331.0, - "completions/min_terminated_length": 331.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1046.177734375, + "completions/mean_terminated_length": 935.3471069335938, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, "epoch": 0.13484680378936587, - "grad_norm": 0.3927019238471985, - "kl": 0.091552734375, - "learning_rate": 9.966816066726895e-07, - "loss": 0.0801, - "num_tokens": 268208397.0, - "reward": 1.154296875, - "reward_std": 0.3116268515586853, - "rewards/accuracy_reward/mean": 0.224609375, - "rewards/accuracy_reward/std": 0.41773295402526855, + "grad_norm": 0.22697971761226654, + "kl": 0.039886474609375, + "learning_rate": 9.967438099455881e-07, + "loss": 0.1606, + "num_tokens": 293156524.0, + "reward": 1.16064453125, + "reward_std": 0.3526654541492462, + "rewards/accuracy_reward/mean": 0.259765625, + "rewards/accuracy_reward/std": 0.4389347732067108, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.1835651993751526, + "rewards/tag_count_reward/mean": 0.90087890625, + "rewards/tag_count_reward/std": 0.23106196522712708, "step": 395 }, { @@ -11470,27 +11470,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.142578125, + "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1194.126953125, - "completions/mean_terminated_length": 1052.1390380859375, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1121.23046875, + "completions/mean_terminated_length": 969.5772705078125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.13518818810275668, - "grad_norm": 0.8550708293914795, - "kl": 0.1165771484375, - "learning_rate": 9.96616303248024e-07, - "loss": 0.1276, - "num_tokens": 268900174.0, - "reward": 1.0263671875, - "reward_std": 0.2901027798652649, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, + "grad_norm": 0.1371777057647705, + "kl": 0.0308837890625, + "learning_rate": 9.96679091547922e-07, + "loss": 0.149, + "num_tokens": 293810978.0, + "reward": 1.001953125, + "reward_std": 0.30599159002304077, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.1949920952320099, + "rewards/tag_count_reward/mean": 0.900390625, + "rewards/tag_count_reward/std": 0.2246759831905365, "step": 396 }, { @@ -11499,27 +11499,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.146484375, + "completions/clipped_ratio": 0.150390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1204.087890625, - "completions/mean_terminated_length": 1059.251708984375, - "completions/min_length": 214.0, - "completions/min_terminated_length": 214.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 1141.888671875, + "completions/mean_terminated_length": 981.4965209960938, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.13552957241614746, - "grad_norm": 0.7503846287727356, - "kl": 0.127197265625, - "learning_rate": 9.965503659332037e-07, - "loss": 0.1111, - "num_tokens": 269587771.0, - "reward": 1.03857421875, - "reward_std": 0.31176069378852844, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, + "grad_norm": 0.13772086799144745, + "kl": 0.0369873046875, + "learning_rate": 9.96613738689877e-07, + "loss": 0.1924, + "num_tokens": 294466729.0, + "reward": 1.0146484375, + "reward_std": 0.34111836552619934, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90771484375, - "rewards/tag_count_reward/std": 0.2048913985490799, + "rewards/tag_count_reward/mean": 0.8798828125, + "rewards/tag_count_reward/std": 0.24970743060112, "step": 397 }, { @@ -11528,27 +11528,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.08984375, + "completions/clipped_ratio": 0.12890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1140.435546875, - "completions/mean_terminated_length": 1050.84765625, - "completions/min_length": 167.0, - "completions/min_terminated_length": 167.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1051.92578125, + "completions/mean_terminated_length": 904.5247192382812, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.13587095672953828, - "grad_norm": 0.6859690546989441, - "kl": 0.0980224609375, - "learning_rate": 9.964837948218138e-07, - "loss": 0.1032, - "num_tokens": 270245450.0, - "reward": 1.080078125, - "reward_std": 0.21009346842765808, - "rewards/accuracy_reward/mean": 0.14453125, - "rewards/accuracy_reward/std": 0.35197147727012634, + "grad_norm": 0.1373293101787567, + "kl": 0.03533935546875, + "learning_rate": 9.965477514642797e-07, + "loss": 0.1332, + "num_tokens": 295079091.0, + "reward": 1.076171875, + "reward_std": 0.26283931732177734, + "rewards/accuracy_reward/mean": 0.177734375, + "rewards/accuracy_reward/std": 0.3826628625392914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.16550962626934052, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.2275916188955307, "step": 398 }, { @@ -11557,27 +11557,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.126953125, + "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1248.32421875, - "completions/mean_terminated_length": 1132.040283203125, - "completions/min_length": 199.0, - "completions/min_terminated_length": 199.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1181.81640625, + "completions/mean_terminated_length": 1040.0772705078125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.13621234104292906, - "grad_norm": 2.203038215637207, - "kl": 0.1400146484375, - "learning_rate": 9.964165900083402e-07, - "loss": 0.1197, - "num_tokens": 270966608.0, - "reward": 1.05810546875, - "reward_std": 0.3260589838027954, - "rewards/accuracy_reward/mean": 0.142578125, - "rewards/accuracy_reward/std": 0.3499840497970581, + "grad_norm": 0.11741556972265244, + "kl": 0.031097412109375, + "learning_rate": 9.964811299648581e-07, + "loss": 0.135, + "num_tokens": 295766197.0, + "reward": 1.09375, + "reward_std": 0.37734484672546387, + "rewards/accuracy_reward/mean": 0.20703125, + "rewards/accuracy_reward/std": 0.40557438135147095, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.19614213705062866, + "rewards/tag_count_reward/mean": 0.88671875, + "rewards/tag_count_reward/std": 0.2394738495349884, "step": 399 }, { @@ -11586,27 +11586,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.12109375, + "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 1145.833984375, - "completions/mean_terminated_length": 1021.5355834960938, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1062.68359375, + "completions/mean_terminated_length": 921.9241333007812, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, "epoch": 0.13655372535631988, - "grad_norm": 7.622913837432861, - "kl": 0.250732421875, - "learning_rate": 9.963487515881678e-07, - "loss": 0.1629, - "num_tokens": 271637531.0, - "reward": 1.05419921875, - "reward_std": 0.2815008759498596, - "rewards/accuracy_reward/mean": 0.140625, - "rewards/accuracy_reward/std": 0.3479743003845215, + "grad_norm": 0.13724146783351898, + "kl": 0.033111572265625, + "learning_rate": 9.964138742862408e-07, + "loss": 0.1326, + "num_tokens": 296394547.0, + "reward": 1.0693359375, + "reward_std": 0.31078818440437317, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.2032666653394699, + "rewards/tag_count_reward/mean": 0.9013671875, + "rewards/tag_count_reward/std": 0.22780774533748627, "step": 400 }, { @@ -11615,27 +11615,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.115234375, + "completions/clipped_ratio": 0.134765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 1098.95703125, - "completions/mean_terminated_length": 975.3510131835938, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1054.640625, + "completions/mean_terminated_length": 899.9187622070312, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.1368951096697107, - "grad_norm": 0.43843021988868713, - "kl": 0.1151123046875, - "learning_rate": 9.962802796575811e-07, - "loss": 0.1067, - "num_tokens": 272284245.0, - "reward": 1.11767578125, - "reward_std": 0.24404722452163696, - "rewards/accuracy_reward/mean": 0.185546875, - "rewards/accuracy_reward/std": 0.38912075757980347, + "grad_norm": 0.36762896180152893, + "kl": 0.0484619140625, + "learning_rate": 9.963459845239579e-07, + "loss": 0.1355, + "num_tokens": 297018571.0, + "reward": 1.080078125, + "reward_std": 0.2847171723842621, + "rewards/accuracy_reward/mean": 0.181640625, + "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.17669491469860077, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.2275916188955307, "step": 401 }, { @@ -11644,27 +11644,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.140625, + "completions/clipped_ratio": 0.154296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1223.072265625, - "completions/mean_terminated_length": 1088.0841064453125, - "completions/min_length": 301.0, - "completions/min_terminated_length": 301.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1154.470703125, + "completions/mean_terminated_length": 991.4480590820312, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.13723649398310148, - "grad_norm": 0.9861530065536499, - "kl": 0.133056640625, - "learning_rate": 9.96211174313763e-07, - "loss": 0.1246, - "num_tokens": 272975242.0, - "reward": 1.06884765625, - "reward_std": 0.28927093744277954, - "rewards/accuracy_reward/mean": 0.154296875, - "rewards/accuracy_reward/std": 0.36158639192581177, + "grad_norm": 0.12652191519737244, + "kl": 0.0291748046875, + "learning_rate": 9.962774607744387e-07, + "loss": 0.1448, + "num_tokens": 297674444.0, + "reward": 1.03466796875, + "reward_std": 0.32548218965530396, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.19571784138679504, + "rewards/tag_count_reward/mean": 0.89990234375, + "rewards/tag_count_reward/std": 0.22904333472251892, "step": 402 }, { @@ -11673,27 +11673,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.109375, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1087.255859375, - "completions/mean_terminated_length": 969.2697143554688, - "completions/min_length": 192.0, - "completions/min_terminated_length": 192.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1006.044921875, + "completions/mean_terminated_length": 862.4866943359375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.1375778782964923, - "grad_norm": 0.6500340700149536, - "kl": 0.1126708984375, - "learning_rate": 9.961414356547962e-07, - "loss": 0.0997, - "num_tokens": 273616125.0, - "reward": 1.09423828125, - "reward_std": 0.31288596987724304, - "rewards/accuracy_reward/mean": 0.171875, - "rewards/accuracy_reward/std": 0.3776407241821289, + "grad_norm": 0.17126622796058655, + "kl": 0.0369873046875, + "learning_rate": 9.962083031350148e-07, + "loss": 0.1525, + "num_tokens": 298273747.0, + "reward": 1.1142578125, + "reward_std": 0.3468095064163208, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4083731174468994, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.1855938732624054, + "rewards/tag_count_reward/mean": 0.9033203125, + "rewards/tag_count_reward/std": 0.22213320434093475, "step": 403 }, { @@ -11702,27 +11702,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1171875, + "completions/clipped_ratio": 0.130859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1189.474609375, - "completions/mean_terminated_length": 1075.5111083984375, - "completions/min_length": 232.0, - "completions/min_terminated_length": 232.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1123.775390625, + "completions/mean_terminated_length": 984.6224975585938, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.13791926260988308, - "grad_norm": 0.7278398871421814, - "kl": 0.099853515625, - "learning_rate": 9.960710637796617e-07, - "loss": 0.0855, - "num_tokens": 274301440.0, - "reward": 1.0654296875, - "reward_std": 0.28713470697402954, - "rewards/accuracy_reward/mean": 0.146484375, - "rewards/accuracy_reward/std": 0.35393697023391724, + "grad_norm": 0.1291474848985672, + "kl": 0.035247802734375, + "learning_rate": 9.961385117039167e-07, + "loss": 0.1149, + "num_tokens": 298925424.0, + "reward": 1.083984375, + "reward_std": 0.3407275080680847, + "rewards/accuracy_reward/mean": 0.189453125, + "rewards/accuracy_reward/std": 0.3922513723373413, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.1966511756181717, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.23378820717334747, "step": 404 }, { @@ -11731,27 +11731,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.087890625, + "completions/clipped_ratio": 0.134765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 1110.943359375, - "completions/mean_terminated_length": 1020.6488037109375, - "completions/min_length": 79.0, - "completions/min_terminated_length": 79.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1115.7109375, + "completions/mean_terminated_length": 970.5011596679688, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.1382606469232739, - "grad_norm": 0.843716561794281, - "kl": 0.09375, - "learning_rate": 9.960000587882396e-07, - "loss": 0.0897, - "num_tokens": 274946707.0, - "reward": 1.0576171875, - "reward_std": 0.21474316716194153, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, + "grad_norm": 0.1537109613418579, + "kl": 0.0352783203125, + "learning_rate": 9.960680865802762e-07, + "loss": 0.2194, + "num_tokens": 299573132.0, + "reward": 1.00146484375, + "reward_std": 0.30505937337875366, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.16507895290851593, + "rewards/tag_count_reward/mean": 0.89208984375, + "rewards/tag_count_reward/std": 0.2396608144044876, "step": 405 }, { @@ -11760,27 +11760,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.09375, + "completions/clipped_ratio": 0.1328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1081.494140625, - "completions/mean_terminated_length": 981.5107421875, - "completions/min_length": 243.0, - "completions/min_terminated_length": 243.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1062.830078125, + "completions/mean_terminated_length": 911.9482421875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, "epoch": 0.13860203123666467, - "grad_norm": 0.6134137511253357, - "kl": 0.1099853515625, - "learning_rate": 9.95928420781309e-07, - "loss": 0.0624, - "num_tokens": 275574352.0, - "reward": 1.1337890625, - "reward_std": 0.21922734379768372, - "rewards/accuracy_reward/mean": 0.18359375, - "rewards/accuracy_reward/std": 0.3875311613082886, + "grad_norm": 0.14802739024162292, + "kl": 0.033355712890625, + "learning_rate": 9.959970278641246e-07, + "loss": 0.1128, + "num_tokens": 300191221.0, + "reward": 1.0869140625, + "reward_std": 0.2655200660228729, + "rewards/accuracy_reward/mean": 0.181640625, + "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9501953125, - "rewards/tag_count_reward/std": 0.1406233012676239, + "rewards/tag_count_reward/mean": 0.9052734375, + "rewards/tag_count_reward/std": 0.21515825390815735, "step": 406 }, { @@ -11789,27 +11789,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1171875, + "completions/clipped_ratio": 0.21484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 1095.412109375, - "completions/mean_terminated_length": 968.96240234375, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1172.5234375, + "completions/mean_terminated_length": 932.9651489257812, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, "epoch": 0.1389434155500555, - "grad_norm": 12.477898597717285, - "kl": 0.269775390625, - "learning_rate": 9.95856149860546e-07, - "loss": 0.1398, - "num_tokens": 276222131.0, - "reward": 1.05126953125, - "reward_std": 0.2644086480140686, - "rewards/accuracy_reward/mean": 0.13306452333927155, - "rewards/accuracy_reward/std": 0.3399873673915863, + "grad_norm": 0.16850464046001434, + "kl": 0.04071044921875, + "learning_rate": 9.959253356563931e-07, + "loss": 0.2187, + "num_tokens": 300878481.0, + "reward": 0.958984375, + "reward_std": 0.3165563941001892, + "rewards/accuracy_reward/mean": 0.13508065044879913, + "rewards/accuracy_reward/std": 0.3421548008918762, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.19460150599479675, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.2934376895427704, "step": 407 }, { @@ -11818,27 +11818,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.123046875, + "completions/clipped_ratio": 0.173828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1148.38671875, - "completions/mean_terminated_length": 1022.160400390625, - "completions/min_length": 28.0, - "completions/min_terminated_length": 28.0, - "epoch": 0.13928479986344627, - "grad_norm": 2.7371554374694824, - "kl": 0.194091796875, - "learning_rate": 9.957832461285267e-07, - "loss": 0.1006, - "num_tokens": 276897081.0, - "reward": 0.99755859375, - "reward_std": 0.250779390335083, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1132.078125, + "completions/mean_terminated_length": 939.366455078125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.13928479986344627, + "grad_norm": 0.1512688398361206, + "kl": 0.03875732421875, + "learning_rate": 9.958530100589131e-07, + "loss": 0.1723, + "num_tokens": 301545081.0, + "reward": 0.9375, + "reward_std": 0.3331637382507324, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.1818491518497467, + "rewards/tag_count_reward/mean": 0.857421875, + "rewards/tag_count_reward/std": 0.2721293866634369, "step": 408 }, { @@ -11847,27 +11847,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0703125, + "completions/clipped_ratio": 0.123046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 1098.951171875, - "completions/mean_terminated_length": 1027.1744384765625, - "completions/min_length": 266.0, - "completions/min_terminated_length": 266.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 1062.703125, + "completions/mean_terminated_length": 924.454345703125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, "epoch": 0.1396261841768371, - "grad_norm": 0.6406298875808716, - "kl": 0.1231689453125, - "learning_rate": 9.957097096887246e-07, - "loss": 0.0937, - "num_tokens": 277536512.0, - "reward": 1.0390625, - "reward_std": 0.2268882691860199, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, + "grad_norm": 0.14967599511146545, + "kl": 0.0372314453125, + "learning_rate": 9.957800511744153e-07, + "loss": 0.184, + "num_tokens": 302165953.0, + "reward": 0.99853515625, + "reward_std": 0.29199162125587463, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.94921875, - "rewards/tag_count_reward/std": 0.15118376910686493, + "rewards/tag_count_reward/mean": 0.89892578125, + "rewards/tag_count_reward/std": 0.22754070162773132, "step": 409 }, { @@ -11876,27 +11876,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.087890625, + "completions/clipped_ratio": 0.134765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1129.705078125, - "completions/mean_terminated_length": 1041.2183837890625, - "completions/min_length": 219.0, - "completions/min_terminated_length": 219.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1097.9140625, + "completions/mean_terminated_length": 949.9323120117188, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.13996756849022787, - "grad_norm": 1.1640084981918335, - "kl": 0.156494140625, - "learning_rate": 9.95635540645511e-07, - "loss": 0.0856, - "num_tokens": 278205705.0, - "reward": 1.0146484375, - "reward_std": 0.23164458572864532, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, + "grad_norm": 0.2295297086238861, + "kl": 0.0372314453125, + "learning_rate": 9.957064591065301e-07, + "loss": 0.1574, + "num_tokens": 302818869.0, + "reward": 0.97705078125, + "reward_std": 0.27609753608703613, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9306640625, - "rewards/tag_count_reward/std": 0.1812576949596405, + "rewards/tag_count_reward/mean": 0.88330078125, + "rewards/tag_count_reward/std": 0.24466808140277863, "step": 410 }, { @@ -11905,27 +11905,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1050.11328125, - "completions/mean_terminated_length": 996.7283935546875, - "completions/min_length": 200.0, - "completions/min_terminated_length": 200.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1041.005859375, + "completions/mean_terminated_length": 941.60302734375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, "epoch": 0.1403089528036187, - "grad_norm": 0.9455592036247253, - "kl": 0.128662109375, - "learning_rate": 9.95560739104155e-07, - "loss": 0.0635, - "num_tokens": 278820243.0, - "reward": 1.16796875, - "reward_std": 0.24406027793884277, - "rewards/accuracy_reward/mean": 0.205078125, - "rewards/accuracy_reward/std": 0.4041535556316376, + "grad_norm": 0.1434800773859024, + "kl": 0.036102294921875, + "learning_rate": 9.956322339597874e-07, + "loss": 0.1088, + "num_tokens": 303428744.0, + "reward": 1.16552734375, + "reward_std": 0.2934776544570923, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.42882615327835083, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.962890625, - "rewards/tag_count_reward/std": 0.12740769982337952, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.20656347274780273, "step": 411 }, { @@ -11934,27 +11934,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.103515625, + "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 1094.158203125, - "completions/mean_terminated_length": 984.0195922851562, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1103.9140625, + "completions/mean_terminated_length": 969.044677734375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, "epoch": 0.14065033711700947, - "grad_norm": 1.8893964290618896, - "kl": 0.1865234375, - "learning_rate": 9.95485305170824e-07, - "loss": 0.0947, - "num_tokens": 279457684.0, - "reward": 1.0546875, - "reward_std": 0.24043290317058563, - "rewards/accuracy_reward/mean": 0.11895161122083664, - "rewards/accuracy_reward/std": 0.3240584135055542, + "grad_norm": 0.21864053606987, + "kl": 0.037078857421875, + "learning_rate": 9.955573758396162e-07, + "loss": 0.1377, + "num_tokens": 304071180.0, + "reward": 1.02392578125, + "reward_std": 0.28571438789367676, + "rewards/accuracy_reward/mean": 0.1270161271095276, + "rewards/accuracy_reward/std": 0.33332720398902893, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.16550962626934052, + "rewards/tag_count_reward/mean": 0.90087890625, + "rewards/tag_count_reward/std": 0.22188088297843933, "step": 412 }, { @@ -11963,27 +11963,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.07421875, + "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1008.185546875, - "completions/mean_terminated_length": 924.8248291015625, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1034.455078125, + "completions/mean_terminated_length": 889.6629638671875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, "epoch": 0.14099172143040029, - "grad_norm": 0.7469418048858643, - "kl": 0.1116943359375, - "learning_rate": 9.95409238952583e-07, - "loss": 0.1202, - "num_tokens": 280042723.0, - "reward": 1.1484375, - "reward_std": 0.2419055700302124, - "rewards/accuracy_reward/mean": 0.203125, - "rewards/accuracy_reward/std": 0.4027182459831238, + "grad_norm": 0.15414918959140778, + "kl": 0.035614013671875, + "learning_rate": 9.954818848523442e-07, + "loss": 0.1777, + "num_tokens": 304669669.0, + "reward": 1.080078125, + "reward_std": 0.294488787651062, + "rewards/accuracy_reward/mean": 0.189453125, + "rewards/accuracy_reward/std": 0.3922513723373413, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9453125, - "rewards/tag_count_reward/std": 0.16007427871227264, + "rewards/tag_count_reward/mean": 0.890625, + "rewards/tag_count_reward/std": 0.24530823528766632, "step": 413 }, { @@ -11992,27 +11992,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.078125, + "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1070.05859375, - "completions/mean_terminated_length": 987.1821899414062, - "completions/min_length": 175.0, - "completions/min_terminated_length": 175.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1044.52734375, + "completions/mean_terminated_length": 923.7593383789062, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.14133310574379107, - "grad_norm": 0.821952223777771, - "kl": 0.1478271484375, - "learning_rate": 9.953325405573935e-07, - "loss": 0.1546, - "num_tokens": 280678113.0, - "reward": 1.08203125, - "reward_std": 0.24936708807945251, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, + "grad_norm": 0.1796736866235733, + "kl": 0.03643798828125, + "learning_rate": 9.954057611051986e-07, + "loss": 0.1675, + "num_tokens": 305291987.0, + "reward": 1.06005859375, + "reward_std": 0.30217552185058594, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.951171875, - "rewards/tag_count_reward/std": 0.15101934969425201, + "rewards/tag_count_reward/mean": 0.90966796875, + "rewards/tag_count_reward/std": 0.21676163375377655, "step": 414 }, { @@ -12021,27 +12021,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.083984375, + "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1101.2890625, - "completions/mean_terminated_length": 1014.4904174804688, - "completions/min_length": 238.0, - "completions/min_terminated_length": 238.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1102.021484375, + "completions/mean_terminated_length": 976.4490966796875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, "epoch": 0.14167449005718188, - "grad_norm": 0.9400856494903564, - "kl": 0.1046142578125, - "learning_rate": 9.952552100941155e-07, - "loss": 0.1123, - "num_tokens": 281315973.0, - "reward": 1.08935546875, - "reward_std": 0.26338285207748413, - "rewards/accuracy_reward/mean": 0.142578125, - "rewards/accuracy_reward/std": 0.3499840497970581, + "grad_norm": 0.1292155236005783, + "kl": 0.035797119140625, + "learning_rate": 9.953290047063054e-07, + "loss": 0.163, + "num_tokens": 305930222.0, + "reward": 1.02734375, + "reward_std": 0.29956069588661194, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.94677734375, - "rewards/tag_count_reward/std": 0.1531609743833542, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.22059866786003113, "step": 415 }, { @@ -12050,27 +12050,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.080078125, + "completions/clipped_ratio": 0.115234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1065.1875, - "completions/mean_terminated_length": 979.6348876953125, - "completions/min_length": 117.0, - "completions/min_terminated_length": 117.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1050.326171875, + "completions/mean_terminated_length": 920.3862915039062, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, "epoch": 0.14201587437057267, - "grad_norm": 1.530302882194519, - "kl": 0.161865234375, - "learning_rate": 9.951772476725047e-07, - "loss": 0.1158, - "num_tokens": 281939077.0, - "reward": 1.126953125, - "reward_std": 0.24623417854309082, - "rewards/accuracy_reward/mean": 0.19140625, - "rewards/accuracy_reward/std": 0.3937928080558777, + "grad_norm": 0.6303587555885315, + "kl": 0.07183837890625, + "learning_rate": 9.952516157646884e-07, + "loss": 0.1541, + "num_tokens": 306545717.0, + "reward": 1.12158203125, + "reward_std": 0.2818402349948883, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41380295157432556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.18036192655563354, + "rewards/tag_count_reward/mean": 0.90283203125, + "rewards/tag_count_reward/std": 0.2313636690378189, "step": 416 }, { @@ -12079,27 +12079,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.10546875, + "completions/clipped_ratio": 0.134765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 1130.302734375, - "completions/mean_terminated_length": 1022.1026611328125, - "completions/min_length": 250.0, - "completions/min_terminated_length": 250.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1147.41015625, + "completions/mean_terminated_length": 1007.1376953125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, "epoch": 0.14235725868396348, - "grad_norm": 0.9781035780906677, - "kl": 0.0975341796875, - "learning_rate": 9.950986534032149e-07, - "loss": 0.0945, - "num_tokens": 282589616.0, - "reward": 1.0986328125, - "reward_std": 0.33007973432540894, - "rewards/accuracy_reward/mean": 0.171875, - "rewards/accuracy_reward/std": 0.3776407241821289, + "grad_norm": 0.12928138673305511, + "kl": 0.03466796875, + "learning_rate": 9.951735943902704e-07, + "loss": 0.1337, + "num_tokens": 307205015.0, + "reward": 1.10791015625, + "reward_std": 0.3449528217315674, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.4027182459831238, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.19347688555717468, + "rewards/tag_count_reward/mean": 0.90478515625, + "rewards/tag_count_reward/std": 0.21408694982528687, "step": 417 }, { @@ -12108,27 +12108,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.08984375, + "completions/clipped_ratio": 0.123046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1083.5, - "completions/mean_terminated_length": 988.2918701171875, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1065.63671875, + "completions/mean_terminated_length": 927.799560546875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, "epoch": 0.14269864299735427, - "grad_norm": 0.8044054508209229, - "kl": 0.116943359375, - "learning_rate": 9.950194273977964e-07, - "loss": 0.0649, - "num_tokens": 283220176.0, - "reward": 0.990234375, - "reward_std": 0.21570071578025818, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, + "grad_norm": 0.1417904496192932, + "kl": 0.0341796875, + "learning_rate": 9.95094940693873e-07, + "loss": 0.1148, + "num_tokens": 307826429.0, + "reward": 0.96826171875, + "reward_std": 0.2755052447319031, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.16402500867843628, + "rewards/tag_count_reward/mean": 0.90576171875, + "rewards/tag_count_reward/std": 0.22180765867233276, "step": 418 }, { @@ -12137,27 +12137,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0703125, + "completions/clipped_ratio": 0.111328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1052.771484375, - "completions/mean_terminated_length": 977.5021362304688, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1094.279296875, + "completions/mean_terminated_length": 974.80224609375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.14304002731074508, - "grad_norm": 0.7905445098876953, - "kl": 0.1112060546875, - "learning_rate": 9.949395697686958e-07, - "loss": 0.0735, - "num_tokens": 283831259.0, - "reward": 1.1220703125, - "reward_std": 0.2493770867586136, - "rewards/accuracy_reward/mean": 0.16796875, - "rewards/accuracy_reward/std": 0.374204158782959, + "grad_norm": 0.12677240371704102, + "kl": 0.030975341796875, + "learning_rate": 9.95015654787215e-07, + "loss": 0.0976, + "num_tokens": 308458764.0, + "reward": 1.10986328125, + "reward_std": 0.27914077043533325, + "rewards/accuracy_reward/mean": 0.193359375, + "rewards/accuracy_reward/std": 0.39531853795051575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9541015625, - "rewards/tag_count_reward/std": 0.14366182684898376, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.20804768800735474, "step": 419 }, { @@ -12166,27 +12166,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.087890625, + "completions/clipped_ratio": 0.1484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 1091.595703125, - "completions/mean_terminated_length": 999.4368286132812, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1109.974609375, + "completions/mean_terminated_length": 946.465576171875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, "epoch": 0.14338141162413587, - "grad_norm": 0.7350541353225708, - "kl": 0.151123046875, - "learning_rate": 9.948590806292565e-07, - "loss": 0.0946, - "num_tokens": 284474604.0, - "reward": 0.97607421875, - "reward_std": 0.1850041151046753, - "rewards/accuracy_reward/mean": 0.04032257944345474, - "rewards/accuracy_reward/std": 0.19691328704357147, + "grad_norm": 0.14157600700855255, + "kl": 0.037841796875, + "learning_rate": 9.94935736782914e-07, + "loss": 0.1489, + "num_tokens": 309111519.0, + "reward": 0.9443359375, + "reward_std": 0.25531116127967834, + "rewards/accuracy_reward/mean": 0.052419353276491165, + "rewards/accuracy_reward/std": 0.22309619188308716, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.93701171875, - "rewards/tag_count_reward/std": 0.16347575187683105, + "rewards/tag_count_reward/mean": 0.8935546875, + "rewards/tag_count_reward/std": 0.23334431648254395, "step": 420 }, { @@ -12195,27 +12195,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.10546875, + "completions/clipped_ratio": 0.119140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 1126.125, - "completions/mean_terminated_length": 1017.4323120117188, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1115.642578125, + "completions/mean_terminated_length": 989.5365600585938, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, "epoch": 0.14372279593752668, - "grad_norm": 19.234102249145508, - "kl": 0.6923828125, - "learning_rate": 9.947779600937181e-07, - "loss": 0.1407, - "num_tokens": 285135980.0, - "reward": 1.06689453125, - "reward_std": 0.27593308687210083, - "rewards/accuracy_reward/mean": 0.134765625, - "rewards/accuracy_reward/std": 0.3418070077896118, + "grad_norm": 1.5215121507644653, + "kl": 0.04486083984375, + "learning_rate": 9.948551867944848e-07, + "loss": 0.1294, + "num_tokens": 309767528.0, + "reward": 1.0654296875, + "reward_std": 0.3083324432373047, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.1794423907995224, + "rewards/tag_count_reward/mean": 0.9091796875, + "rewards/tag_count_reward/std": 0.21964147686958313, "step": 421 }, { @@ -12224,27 +12224,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.087890625, + "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 1111.66015625, - "completions/mean_terminated_length": 1021.4346923828125, - "completions/min_length": 96.0, - "completions/min_terminated_length": 96.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1124.982421875, + "completions/mean_terminated_length": 993.122802734375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, "epoch": 0.14406418025091747, - "grad_norm": 33.088340759277344, - "kl": 0.493408203125, - "learning_rate": 9.946962082772163e-07, - "loss": 0.1197, - "num_tokens": 285783246.0, - "reward": 1.00830078125, - "reward_std": 0.2411033660173416, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, + "grad_norm": 0.14088740944862366, + "kl": 0.03472900390625, + "learning_rate": 9.947740049363404e-07, + "loss": 0.1804, + "num_tokens": 310421615.0, + "reward": 0.99951171875, + "reward_std": 0.30131995677948, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.1800537258386612, + "rewards/tag_count_reward/mean": 0.89599609375, + "rewards/tag_count_reward/std": 0.23208530247211456, "step": 422 }, { @@ -12253,27 +12253,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, + "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 1027.78515625, - "completions/mean_terminated_length": 959.7708740234375, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1026.978515625, + "completions/mean_terminated_length": 914.0238647460938, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, "epoch": 0.14440556456430828, - "grad_norm": 9.780620574951172, - "kl": 0.3726806640625, - "learning_rate": 9.946138252957827e-07, - "loss": 0.0718, - "num_tokens": 286387408.0, - "reward": 1.12353515625, - "reward_std": 0.2738574743270874, - "rewards/accuracy_reward/mean": 0.177734375, - "rewards/accuracy_reward/std": 0.3826628625392914, + "grad_norm": 0.15640226006507874, + "kl": 0.0390625, + "learning_rate": 9.946921913237908e-07, + "loss": 0.0839, + "num_tokens": 311025364.0, + "reward": 1.09130859375, + "reward_std": 0.320438414812088, + "rewards/accuracy_reward/mean": 0.181640625, + "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.94580078125, - "rewards/tag_count_reward/std": 0.15985849499702454, + "rewards/tag_count_reward/mean": 0.90966796875, + "rewards/tag_count_reward/std": 0.21844784915447235, "step": 423 }, { @@ -12282,27 +12282,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.07421875, + "completions/clipped_ratio": 0.1328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1135.7109375, - "completions/mean_terminated_length": 1062.57373046875, - "completions/min_length": 234.0, - "completions/min_terminated_length": 234.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1140.55078125, + "completions/mean_terminated_length": 1001.5720825195312, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, "epoch": 0.14474694887769907, - "grad_norm": 2.2094523906707764, - "kl": 0.220947265625, - "learning_rate": 9.945308112663455e-07, - "loss": 0.0801, - "num_tokens": 287045020.0, - "reward": 1.0849609375, - "reward_std": 0.2298390418291092, - "rewards/accuracy_reward/mean": 0.138671875, - "rewards/accuracy_reward/std": 0.34594178199768066, + "grad_norm": 0.11658474057912827, + "kl": 0.0316162109375, + "learning_rate": 9.946097460730436e-07, + "loss": 0.1107, + "num_tokens": 311685454.0, + "reward": 1.06005859375, + "reward_std": 0.3215235471725464, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9462890625, - "rewards/tag_count_reward/std": 0.15178616344928741, + "rewards/tag_count_reward/mean": 0.89599609375, + "rewards/tag_count_reward/std": 0.22513006627559662, "step": 424 }, { @@ -12311,27 +12311,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.068359375, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 1094.0234375, - "completions/mean_terminated_length": 1024.025146484375, - "completions/min_length": 212.0, - "completions/min_terminated_length": 212.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 1126.42578125, + "completions/mean_terminated_length": 999.453369140625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.14508833319108988, - "grad_norm": 1.2251715660095215, - "kl": 0.12353515625, - "learning_rate": 9.94447166306727e-07, - "loss": 0.1319, - "num_tokens": 287689400.0, - "reward": 1.04296875, - "reward_std": 0.2499944269657135, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, + "grad_norm": 0.14736075699329376, + "kl": 0.03631591796875, + "learning_rate": 9.945266693012037e-07, + "loss": 0.1691, + "num_tokens": 312346424.0, + "reward": 1.01318359375, + "reward_std": 0.29328978061676025, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.1734480857849121, + "rewards/tag_count_reward/mean": 0.90380859375, + "rewards/tag_count_reward/std": 0.22316910326480865, "step": 425 }, { @@ -12340,27 +12340,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.095703125, + "completions/clipped_ratio": 0.138671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1104.681640625, - "completions/mean_terminated_length": 1004.8487548828125, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1114.875, + "completions/mean_terminated_length": 964.6439819335938, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.14542971750448067, - "grad_norm": 0.9261499047279358, - "kl": 0.15966796875, - "learning_rate": 9.94362890535647e-07, - "loss": 0.1016, - "num_tokens": 288330581.0, - "reward": 1.04736328125, - "reward_std": 0.27268338203430176, - "rewards/accuracy_reward/mean": 0.11491935700178146, - "rewards/accuracy_reward/std": 0.3192465901374817, + "grad_norm": 0.13163742423057556, + "kl": 0.034912109375, + "learning_rate": 9.944429611262728e-07, + "loss": 0.1491, + "num_tokens": 312992824.0, + "reward": 1.02978515625, + "reward_std": 0.3319040536880493, + "rewards/accuracy_reward/mean": 0.14314515888690948, + "rewards/accuracy_reward/std": 0.35057440400123596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.16384358704090118, + "rewards/tag_count_reward/mean": 0.89111328125, + "rewards/tag_count_reward/std": 0.23870600759983063, "step": 426 }, { @@ -12369,27 +12369,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.080078125, + "completions/clipped_ratio": 0.126953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1078.265625, - "completions/mean_terminated_length": 993.8514404296875, - "completions/min_length": 175.0, - "completions/min_terminated_length": 175.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1063.32421875, + "completions/mean_terminated_length": 920.1387329101562, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.14577110181787148, - "grad_norm": 14.89603328704834, - "kl": 0.5328369140625, - "learning_rate": 9.942779840727185e-07, - "loss": 0.1138, - "num_tokens": 288961213.0, - "reward": 1.04638671875, - "reward_std": 0.2174864411354065, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, + "grad_norm": 0.1511382907629013, + "kl": 0.036834716796875, + "learning_rate": 9.943586216671493e-07, + "loss": 0.1876, + "num_tokens": 313615806.0, + "reward": 1.00439453125, + "reward_std": 0.29750609397888184, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.1532669961452484, + "rewards/tag_count_reward/mean": 0.88916015625, + "rewards/tag_count_reward/std": 0.24289245903491974, "step": 427 }, { @@ -12398,27 +12398,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.064453125, + "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1067.38671875, - "completions/mean_terminated_length": 999.828857421875, - "completions/min_length": 200.0, - "completions/min_terminated_length": 200.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1062.6328125, + "completions/mean_terminated_length": 944.0437622070312, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, "epoch": 0.14611248613126226, - "grad_norm": 2.4918482303619385, - "kl": 0.1580810546875, - "learning_rate": 9.941924470384515e-07, - "loss": 0.064, - "num_tokens": 289578515.0, - "reward": 1.197265625, - "reward_std": 0.26387596130371094, - "rewards/accuracy_reward/mean": 0.240234375, - "rewards/accuracy_reward/std": 0.4276435375213623, + "grad_norm": 0.14157192409038544, + "kl": 0.03338623046875, + "learning_rate": 9.942736510436285e-07, + "loss": 0.14, + "num_tokens": 314230674.0, + "reward": 1.16650390625, + "reward_std": 0.31410905718803406, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.43343618512153625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.95703125, - "rewards/tag_count_reward/std": 0.13403046131134033, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.20980393886566162, "step": 428 }, { @@ -12427,27 +12427,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.091796875, + "completions/clipped_ratio": 0.13671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 1083.28515625, - "completions/mean_terminated_length": 985.7763671875, - "completions/min_length": 214.0, - "completions/min_terminated_length": 214.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1104.36328125, + "completions/mean_terminated_length": 954.9185791015625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, "epoch": 0.14645387044465308, - "grad_norm": 0.9718239307403564, - "kl": 0.1097412109375, - "learning_rate": 9.941062795542496e-07, - "loss": 0.0867, - "num_tokens": 290212341.0, - "reward": 1.0380859375, - "reward_std": 0.21170061826705933, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.1779806911945343, + "grad_norm": 0.1387367695569992, + "kl": 0.033233642578125, + "learning_rate": 9.941880493764027e-07, + "loss": 0.1513, + "num_tokens": 314875292.0, + "reward": 0.99609375, + "reward_std": 0.2841317653656006, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.892578125, + "rewards/tag_count_reward/std": 0.23289547860622406, "step": 429 }, { @@ -12456,27 +12456,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.068359375, + "completions/clipped_ratio": 0.115234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 1034.599609375, - "completions/mean_terminated_length": 960.2410278320312, - "completions/min_length": 199.0, - "completions/min_terminated_length": 199.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1051.76171875, + "completions/mean_terminated_length": 922.0088500976562, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, "epoch": 0.14679525475804386, - "grad_norm": 0.8444366455078125, - "kl": 0.1531982421875, - "learning_rate": 9.94019481742412e-07, - "loss": 0.1347, - "num_tokens": 290828648.0, - "reward": 1.11376953125, - "reward_std": 0.2682647109031677, - "rewards/accuracy_reward/mean": 0.162109375, - "rewards/accuracy_reward/std": 0.3689115643501282, + "grad_norm": 0.14188094437122345, + "kl": 0.0350341796875, + "learning_rate": 9.941018167870596e-07, + "loss": 0.0986, + "num_tokens": 315500386.0, + "reward": 1.05224609375, + "reward_std": 0.32542142271995544, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.95166015625, - "rewards/tag_count_reward/std": 0.1523853987455368, + "rewards/tag_count_reward/mean": 0.89208984375, + "rewards/tag_count_reward/std": 0.24067933857440948, "step": 430 }, { @@ -12485,27 +12485,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.068359375, + "completions/clipped_ratio": 0.1015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 1074.7421875, - "completions/mean_terminated_length": 1003.3291015625, - "completions/min_length": 229.0, - "completions/min_terminated_length": 229.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1077.369140625, + "completions/mean_terminated_length": 967.6456298828125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, "epoch": 0.14713663907143468, - "grad_norm": 0.5404824614524841, - "kl": 0.1015625, - "learning_rate": 9.939320537261326e-07, - "loss": 0.0893, - "num_tokens": 291461028.0, - "reward": 1.11962890625, - "reward_std": 0.1981087028980255, - "rewards/accuracy_reward/mean": 0.166015625, - "rewards/accuracy_reward/std": 0.3724585771560669, + "grad_norm": 0.1448766440153122, + "kl": 0.03277587890625, + "learning_rate": 9.94014953398083e-07, + "loss": 0.1235, + "num_tokens": 316134111.0, + "reward": 1.0830078125, + "reward_std": 0.25815266370773315, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.95361328125, - "rewards/tag_count_reward/std": 0.13785336911678314, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.21008896827697754, "step": 431 }, { @@ -12514,27 +12514,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.072265625, + "completions/clipped_ratio": 0.115234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1048.94140625, - "completions/mean_terminated_length": 971.1199340820312, - "completions/min_length": 41.0, - "completions/min_terminated_length": 41.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1071.845703125, + "completions/mean_terminated_length": 944.7086181640625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, "epoch": 0.14747802338482546, - "grad_norm": 0.3985885679721832, - "kl": 0.14013671875, - "learning_rate": 9.93843995629499e-07, - "loss": 0.0741, - "num_tokens": 292083702.0, - "reward": 1.03857421875, - "reward_std": 0.20800724625587463, - "rewards/accuracy_reward/mean": 0.0947580635547638, - "rewards/accuracy_reward/std": 0.29317617416381836, + "grad_norm": 0.15038932859897614, + "kl": 0.03570556640625, + "learning_rate": 9.939274593328542e-07, + "loss": 0.0935, + "num_tokens": 316768512.0, + "reward": 1.01806640625, + "reward_std": 0.25878143310546875, + "rewards/accuracy_reward/mean": 0.11088709533214569, + "rewards/accuracy_reward/std": 0.3143092691898346, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.94677734375, - "rewards/tag_count_reward/std": 0.15155541896820068, + "rewards/tag_count_reward/mean": 0.91064453125, + "rewards/tag_count_reward/std": 0.21490219235420227, "step": 432 }, { @@ -12543,27 +12543,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.09375, + "completions/clipped_ratio": 0.18359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 1123.486328125, - "completions/mean_terminated_length": 1027.846923828125, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1160.26171875, + "completions/mean_terminated_length": 960.6267700195312, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.14781940769821628, - "grad_norm": 0.6801993250846863, - "kl": 0.17626953125, - "learning_rate": 9.937553075774938e-07, - "loss": 0.1014, - "num_tokens": 292741615.0, - "reward": 1.0771484375, - "reward_std": 0.24764740467071533, - "rewards/accuracy_reward/mean": 0.140625, - "rewards/accuracy_reward/std": 0.3479743003845215, + "grad_norm": 0.1468881368637085, + "kl": 0.03759765625, + "learning_rate": 9.938393347156485e-07, + "loss": 0.1812, + "num_tokens": 317445254.0, + "reward": 0.98486328125, + "reward_std": 0.3092552423477173, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.15834276378154755, + "rewards/tag_count_reward/mean": 0.85009765625, + "rewards/tag_count_reward/std": 0.26974835991859436, "step": 433 }, { @@ -12572,27 +12572,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.13671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1087.423828125, - "completions/mean_terminated_length": 1031.853271484375, - "completions/min_length": 243.0, - "completions/min_terminated_length": 243.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1122.84375, + "completions/mean_terminated_length": 976.3258056640625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, "epoch": 0.14816079201160706, - "grad_norm": 13.213711738586426, - "kl": 0.439697265625, - "learning_rate": 9.936659896959935e-07, - "loss": 0.0976, - "num_tokens": 293375336.0, - "reward": 1.03515625, - "reward_std": 0.23112015426158905, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, + "grad_norm": 0.2122993767261505, + "kl": 0.038818359375, + "learning_rate": 9.93750579671638e-07, + "loss": 0.1848, + "num_tokens": 318097110.0, + "reward": 0.95751953125, + "reward_std": 0.32212772965431213, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.15313054621219635, + "rewards/tag_count_reward/mean": 0.87353515625, + "rewards/tag_count_reward/std": 0.25315582752227783, "step": 434 }, { @@ -12601,27 +12601,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.10546875, + "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1145.2109375, - "completions/mean_terminated_length": 1038.7685546875, - "completions/min_length": 255.0, - "completions/min_terminated_length": 255.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1159.16796875, + "completions/mean_terminated_length": 1032.1920166015625, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.14850217632499788, - "grad_norm": 0.8606828451156616, - "kl": 0.2158203125, - "learning_rate": 9.935760421117686e-07, - "loss": 0.0989, - "num_tokens": 294041028.0, - "reward": 1.04541015625, - "reward_std": 0.2330261766910553, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, + "grad_norm": 0.1583891659975052, + "kl": 0.0369873046875, + "learning_rate": 9.936611943268895e-07, + "loss": 0.1416, + "num_tokens": 318769948.0, + "reward": 1.02978515625, + "reward_std": 0.3188987374305725, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.93994140625, - "rewards/tag_count_reward/std": 0.17467160522937775, + "rewards/tag_count_reward/mean": 0.89111328125, + "rewards/tag_count_reward/std": 0.23404912650585175, "step": 435 }, { @@ -12630,27 +12630,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.091796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 1060.1171875, - "completions/mean_terminated_length": 996.4490966796875, - "completions/min_length": 196.0, - "completions/min_terminated_length": 196.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1050.26953125, + "completions/mean_terminated_length": 949.4237060546875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.14884356063838866, - "grad_norm": 1.6122773885726929, - "kl": 0.2325439453125, - "learning_rate": 9.93485464952483e-07, - "loss": 0.0901, - "num_tokens": 294661440.0, - "reward": 1.1298828125, - "reward_std": 0.26418039202690125, - "rewards/accuracy_reward/mean": 0.173828125, - "rewards/accuracy_reward/std": 0.3793322443962097, + "grad_norm": 0.14156770706176758, + "kl": 0.03759765625, + "learning_rate": 9.935711788083654e-07, + "loss": 0.1092, + "num_tokens": 319385318.0, + "reward": 1.14990234375, + "reward_std": 0.33361732959747314, + "rewards/accuracy_reward/mean": 0.244140625, + "rewards/accuracy_reward/std": 0.42999663949012756, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9560546875, - "rewards/tag_count_reward/std": 0.1346244364976883, + "rewards/tag_count_reward/mean": 0.90576171875, + "rewards/tag_count_reward/std": 0.22180765867233276, "step": 436 }, { @@ -12659,27 +12659,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.09765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 968.67578125, - "completions/mean_terminated_length": 922.5133056640625, - "completions/min_length": 224.0, - "completions/min_terminated_length": 224.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1003.669921875, + "completions/mean_terminated_length": 890.6471557617188, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.14918494495177947, - "grad_norm": 18.031034469604492, - "kl": 0.348876953125, - "learning_rate": 9.933942583466944e-07, - "loss": 0.0767, - "num_tokens": 295239338.0, - "reward": 1.09326171875, - "reward_std": 0.1853155493736267, - "rewards/accuracy_reward/mean": 0.1391129046678543, - "rewards/accuracy_reward/std": 0.34641367197036743, + "grad_norm": 0.13414405286312103, + "kl": 0.03759765625, + "learning_rate": 9.934805332439238e-07, + "loss": 0.0956, + "num_tokens": 319981133.0, + "reward": 1.07177734375, + "reward_std": 0.2685449719429016, + "rewards/accuracy_reward/mean": 0.16129031777381897, + "rewards/accuracy_reward/std": 0.3681698739528656, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.95849609375, - "rewards/tag_count_reward/std": 0.13674680888652802, + "rewards/tag_count_reward/mean": 0.91552734375, + "rewards/tag_count_reward/std": 0.2174350470304489, "step": 437 }, { @@ -12688,27 +12688,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1049.7109375, - "completions/mean_terminated_length": 989.7722778320312, - "completions/min_length": 138.0, - "completions/min_terminated_length": 138.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1024.37890625, + "completions/mean_terminated_length": 911.1366577148438, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.14952632926517026, - "grad_norm": 134.8598175048828, - "kl": 1.400146484375, - "learning_rate": 9.93302422423854e-07, - "loss": 0.1738, - "num_tokens": 295856694.0, - "reward": 1.04345703125, - "reward_std": 0.24107421934604645, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, + "grad_norm": 0.1496797502040863, + "kl": 0.03826904296875, + "learning_rate": 9.933892577623165e-07, + "loss": 0.1449, + "num_tokens": 320585519.0, + "reward": 1.044921875, + "reward_std": 0.2584763169288635, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.1710955947637558, + "rewards/tag_count_reward/mean": 0.908203125, + "rewards/tag_count_reward/std": 0.21867573261260986, "step": 438 }, { @@ -12717,27 +12717,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.080078125, + "completions/clipped_ratio": 0.10546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 1089.255859375, - "completions/mean_terminated_length": 1005.79833984375, - "completions/min_length": 212.0, - "completions/min_terminated_length": 212.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1102.052734375, + "completions/mean_terminated_length": 990.5218505859375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.14986771357856107, - "grad_norm": 12.19244384765625, - "kl": 0.2711181640625, - "learning_rate": 9.93209957314306e-07, - "loss": 0.1123, - "num_tokens": 296485497.0, - "reward": 0.99951171875, - "reward_std": 0.21098214387893677, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, + "grad_norm": 0.14976739883422852, + "kl": 0.03509521484375, + "learning_rate": 9.932973524931909e-07, + "loss": 0.1653, + "num_tokens": 321220874.0, + "reward": 0.98876953125, + "reward_std": 0.2523287534713745, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.93701171875, - "rewards/tag_count_reward/std": 0.17007611691951752, + "rewards/tag_count_reward/mean": 0.91259765625, + "rewards/tag_count_reward/std": 0.21169914305210114, "step": 439 }, { @@ -12746,27 +12746,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 1030.8984375, - "completions/mean_terminated_length": 972.0578002929688, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1010.55859375, + "completions/mean_terminated_length": 908.1502075195312, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, "epoch": 0.15020909789195186, - "grad_norm": 1.0037941932678223, - "kl": 0.201171875, - "learning_rate": 9.93116863149288e-07, - "loss": 0.1055, - "num_tokens": 297094325.0, - "reward": 1.13818359375, - "reward_std": 0.266563355922699, - "rewards/accuracy_reward/mean": 0.181640625, - "rewards/accuracy_reward/std": 0.38592514395713806, + "grad_norm": 0.14024309813976288, + "kl": 0.034698486328125, + "learning_rate": 9.932048175670886e-07, + "loss": 0.1466, + "num_tokens": 321819288.0, + "reward": 1.14599609375, + "reward_std": 0.3399246633052826, + "rewards/accuracy_reward/mean": 0.224609375, + "rewards/accuracy_reward/std": 0.41773295402526855, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.95654296875, - "rewards/tag_count_reward/std": 0.1405578851699829, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.21512438356876373, "step": 440 }, { @@ -12775,27 +12775,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0703125, + "completions/clipped_ratio": 0.111328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1062.68359375, - "completions/mean_terminated_length": 988.1639404296875, - "completions/min_length": 250.0, - "completions/min_terminated_length": 250.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1056.2265625, + "completions/mean_terminated_length": 931.9824829101562, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, "epoch": 0.15055048220534267, - "grad_norm": 1.679003357887268, - "kl": 0.250244140625, - "learning_rate": 9.9302314006093e-07, - "loss": 0.119, - "num_tokens": 297706547.0, - "reward": 1.1123046875, - "reward_std": 0.26775506138801575, - "rewards/accuracy_reward/mean": 0.16015625, - "rewards/accuracy_reward/std": 0.3671095669269562, + "grad_norm": 0.14226427674293518, + "kl": 0.03460693359375, + "learning_rate": 9.931116531154458e-07, + "loss": 0.1441, + "num_tokens": 322428204.0, + "reward": 1.03759765625, + "reward_std": 0.2947860360145569, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9521484375, - "rewards/tag_count_reward/std": 0.14387451112270355, + "rewards/tag_count_reward/mean": 0.89892578125, + "rewards/tag_count_reward/std": 0.23442442715168, "step": 441 }, { @@ -12804,27 +12804,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1019.21484375, - "completions/mean_terminated_length": 957.4451293945312, - "completions/min_length": 101.0, - "completions/min_terminated_length": 101.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1038.359375, + "completions/mean_terminated_length": 904.3363037109375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, "epoch": 0.15089186651873346, - "grad_norm": 0.7562029361724854, - "kl": 0.166748046875, - "learning_rate": 9.929287881822545e-07, - "loss": 0.064, - "num_tokens": 298306657.0, - "reward": 1.19970703125, - "reward_std": 0.2773776650428772, - "rewards/accuracy_reward/mean": 0.25604838132858276, - "rewards/accuracy_reward/std": 0.43688949942588806, + "grad_norm": 0.13173320889472961, + "kl": 0.03228759765625, + "learning_rate": 9.930178592705929e-07, + "loss": 0.104, + "num_tokens": 323038116.0, + "reward": 1.11572265625, + "reward_std": 0.31469881534576416, + "rewards/accuracy_reward/mean": 0.20766128599643707, + "rewards/accuracy_reward/std": 0.4060424566268921, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.95166015625, - "rewards/tag_count_reward/std": 0.1539822816848755, + "rewards/tag_count_reward/mean": 0.91455078125, + "rewards/tag_count_reward/std": 0.2170523852109909, "step": 442 }, { @@ -12833,27 +12833,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.064453125, + "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1959.0, - "completions/mean_length": 1037.333984375, - "completions/mean_terminated_length": 967.7056884765625, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 997.322265625, + "completions/mean_terminated_length": 898.5406494140625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.15123325083212427, - "grad_norm": 8.234630584716797, - "kl": 0.207763671875, - "learning_rate": 9.92833807647177e-07, - "loss": 0.0942, - "num_tokens": 298920076.0, - "reward": 1.068359375, - "reward_std": 0.2526884377002716, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, + "grad_norm": 0.15234129130840302, + "kl": 0.03607177734375, + "learning_rate": 9.92923436165754e-07, + "loss": 0.1051, + "num_tokens": 323631049.0, + "reward": 1.06689453125, + "reward_std": 0.2941228747367859, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9453125, - "rewards/tag_count_reward/std": 0.16007427871227264, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.20394715666770935, "step": 443 }, { @@ -12862,27 +12862,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0859375, + "completions/clipped_ratio": 0.115234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 1056.375, - "completions/mean_terminated_length": 963.1453247070312, - "completions/min_length": 211.0, - "completions/min_terminated_length": 211.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1066.47265625, + "completions/mean_terminated_length": 938.6357421875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, "epoch": 0.15157463514551506, - "grad_norm": 1.3715955018997192, - "kl": 0.132080078125, - "learning_rate": 9.927381985905051e-07, - "loss": 0.0946, - "num_tokens": 299539148.0, - "reward": 1.06494140625, - "reward_std": 0.27978986501693726, - "rewards/accuracy_reward/mean": 0.13671875, - "rewards/accuracy_reward/std": 0.3438861668109894, + "grad_norm": 0.16123086214065552, + "kl": 0.03369140625, + "learning_rate": 9.928283839350469e-07, + "loss": 0.0977, + "num_tokens": 324255291.0, + "reward": 1.05224609375, + "reward_std": 0.2948440611362457, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.18266178667545319, + "rewards/tag_count_reward/mean": 0.90185546875, + "rewards/tag_count_reward/std": 0.22231541574001312, "step": 444 }, { @@ -12891,27 +12891,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.064453125, + "completions/clipped_ratio": 0.1015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 1059.0078125, - "completions/mean_terminated_length": 990.8726806640625, - "completions/min_length": 229.0, - "completions/min_terminated_length": 229.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1054.919921875, + "completions/mean_terminated_length": 942.6586303710938, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, "epoch": 0.15191601945890587, - "grad_norm": 0.4454004466533661, - "kl": 0.131103515625, - "learning_rate": 9.92641961147938e-07, - "loss": 0.0505, - "num_tokens": 300158368.0, - "reward": 1.04443359375, - "reward_std": 0.23585447669029236, + "grad_norm": 0.1454250067472458, + "kl": 0.03521728515625, + "learning_rate": 9.927327027134833e-07, + "loss": 0.1261, + "num_tokens": 324872418.0, + "reward": 1.009765625, + "reward_std": 0.2698069214820862, "rewards/accuracy_reward/mean": 0.0947580635547638, "rewards/accuracy_reward/std": 0.29317617416381836, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.95263671875, - "rewards/tag_count_reward/std": 0.13207614421844482, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.21412767469882965, "step": 445 }, { @@ -12920,27 +12920,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.126953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1133.0625, - "completions/mean_terminated_length": 1074.0955810546875, - "completions/min_length": 232.0, - "completions/min_terminated_length": 232.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1110.09375, + "completions/mean_terminated_length": 973.7091674804688, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, "epoch": 0.15225740377229666, - "grad_norm": 0.8461600542068481, - "kl": 0.1156005859375, - "learning_rate": 9.925450954560676e-07, - "loss": 0.0838, - "num_tokens": 300813520.0, - "reward": 1.0185546875, - "reward_std": 0.21500445902347565, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, + "grad_norm": 0.14470691978931427, + "kl": 0.0357666015625, + "learning_rate": 9.926363926369685e-07, + "loss": 0.1634, + "num_tokens": 325515810.0, + "reward": 0.9638671875, + "reward_std": 0.27993327379226685, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9462890625, - "rewards/tag_count_reward/std": 0.16267666220664978, + "rewards/tag_count_reward/mean": 0.8955078125, + "rewards/tag_count_reward/std": 0.23888057470321655, "step": 446 }, { @@ -12949,27 +12949,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.07421875, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 1076.810546875, - "completions/mean_terminated_length": 998.951416015625, - "completions/min_length": 232.0, - "completions/min_terminated_length": 232.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1049.02734375, + "completions/mean_terminated_length": 959.7574462890625, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.15259878808568747, - "grad_norm": 0.7099547386169434, - "kl": 0.14013671875, - "learning_rate": 9.924476016523765e-07, - "loss": 0.1158, - "num_tokens": 301436703.0, - "reward": 1.01171875, - "reward_std": 0.18262024223804474, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, + "grad_norm": 0.1457584649324417, + "kl": 0.03619384765625, + "learning_rate": 9.925394538423005e-07, + "loss": 0.1422, + "num_tokens": 326124768.0, + "reward": 0.9912109375, + "reward_std": 0.26201674342155457, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.953125, - "rewards/tag_count_reward/std": 0.1467188447713852, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.20414677262306213, "step": 447 }, { @@ -12978,27 +12978,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0859375, + "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1066.75, - "completions/mean_terminated_length": 974.4957885742188, - "completions/min_length": 185.0, - "completions/min_terminated_length": 185.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1060.12109375, + "completions/mean_terminated_length": 941.2297973632812, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.15294017239907826, - "grad_norm": 0.7692645788192749, - "kl": 0.123779296875, - "learning_rate": 9.9234947987524e-07, - "loss": 0.0775, - "num_tokens": 302060671.0, - "reward": 1.072265625, - "reward_std": 0.24299228191375732, - "rewards/accuracy_reward/mean": 0.14314515888690948, - "rewards/accuracy_reward/std": 0.35057440400123596, + "grad_norm": 0.8526865839958191, + "kl": 0.0455322265625, + "learning_rate": 9.924418864671708e-07, + "loss": 0.1173, + "num_tokens": 326745342.0, + "reward": 1.02490234375, + "reward_std": 0.2669713497161865, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310528099536896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.1748199313879013, + "rewards/tag_count_reward/mean": 0.90380859375, + "rewards/tag_count_reward/std": 0.21874071657657623, "step": 448 }, { @@ -13007,27 +13007,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.06640625, + "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1084.212890625, - "completions/mean_terminated_length": 1015.6589965820312, - "completions/min_length": 249.0, - "completions/min_terminated_length": 249.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1086.126953125, + "completions/mean_terminated_length": 986.6228637695312, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, "epoch": 0.15328155671246907, - "grad_norm": 1.9816856384277344, - "kl": 0.161865234375, - "learning_rate": 9.922507302639234e-07, - "loss": 0.1198, - "num_tokens": 302686556.0, - "reward": 1.02685546875, - "reward_std": 0.18609923124313354, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, + "grad_norm": 1444939.625, + "kl": 5056.0260009765625, + "learning_rate": 9.923436906501635e-07, + "loss": 202.9986, + "num_tokens": 327372207.0, + "reward": 1.01171875, + "reward_std": 0.25588080286979675, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.95458984375, - "rewards/tag_count_reward/std": 0.14508728682994843, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.20715993642807007, "step": 449 }, { @@ -13036,27 +13036,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.087890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 987.177734375, - "completions/mean_terminated_length": 932.7207641601562, - "completions/min_length": 215.0, - "completions/min_terminated_length": 215.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 989.2109375, + "completions/mean_terminated_length": 887.186279296875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, "epoch": 0.15362294102585985, - "grad_norm": 0.7764598727226257, - "kl": 0.139892578125, - "learning_rate": 9.921513529585842e-07, - "loss": 0.0547, - "num_tokens": 303266807.0, - "reward": 1.15380859375, - "reward_std": 0.21334266662597656, - "rewards/accuracy_reward/mean": 0.193359375, - "rewards/accuracy_reward/std": 0.39531853795051575, + "grad_norm": 0.15711206197738647, + "kl": 0.0362548828125, + "learning_rate": 9.922448665307552e-07, + "loss": 0.1206, + "num_tokens": 327953499.0, + "reward": 1.1181640625, + "reward_std": 0.26618778705596924, + "rewards/accuracy_reward/mean": 0.201171875, + "rewards/accuracy_reward/std": 0.4012683033943176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.96044921875, - "rewards/tag_count_reward/std": 0.12810981273651123, + "rewards/tag_count_reward/mean": 0.9169921875, + "rewards/tag_count_reward/std": 0.20676980912685394, "step": 450 }, { @@ -13065,27 +13065,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 1081.927734375, - "completions/mean_terminated_length": 1019.665283203125, - "completions/min_length": 231.0, - "completions/min_terminated_length": 231.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 983.72265625, + "completions/mean_terminated_length": 919.8219604492188, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.15396432533925067, - "grad_norm": 43.58025360107422, - "kl": 1.106201171875, - "learning_rate": 9.920513481002698e-07, - "loss": 0.1273, - "num_tokens": 303902242.0, - "reward": 1.0888671875, - "reward_std": 0.24603061378002167, + "grad_norm": 0.13837960362434387, + "kl": 0.03631591796875, + "learning_rate": 9.921454142493155e-07, + "loss": 0.08, + "num_tokens": 328538653.0, + "reward": 1.0830078125, + "reward_std": 0.26221001148223877, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.9482421875, - "rewards/tag_count_reward/std": 0.16330981254577637, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9443359375, + "rewards/tag_count_reward/std": 0.16867490112781525, "step": 451 }, { @@ -13094,27 +13094,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.06640625, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1040.08984375, - "completions/mean_terminated_length": 968.3974609375, - "completions/min_length": 81.0, - "completions/min_terminated_length": 81.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 984.099609375, + "completions/mean_terminated_length": 898.8079833984375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, "epoch": 0.15430570965264145, - "grad_norm": 225.49293518066406, - "kl": 2.928466796875, - "learning_rate": 9.919507158309192e-07, - "loss": 0.2038, - "num_tokens": 304511056.0, - "reward": 1.0263671875, - "reward_std": 0.20071804523468018, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, + "grad_norm": 0.15354254841804504, + "kl": 0.03790283203125, + "learning_rate": 9.920453339471052e-07, + "loss": 0.1513, + "num_tokens": 329118800.0, + "reward": 0.99267578125, + "reward_std": 0.22924144566059113, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9521484375, - "rewards/tag_count_reward/std": 0.14888782799243927, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.19423779845237732, "step": 452 }, { @@ -13123,27 +13123,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 990.08984375, - "completions/mean_terminated_length": 949.3184204101562, - "completions/min_length": 218.0, - "completions/min_terminated_length": 218.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 936.76953125, + "completions/mean_terminated_length": 852.7269287109375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.15464709396603227, - "grad_norm": 1.5338170528411865, - "kl": 0.320556640625, - "learning_rate": 9.918494562933614e-07, - "loss": 0.1019, - "num_tokens": 305103246.0, - "reward": 1.13720703125, - "reward_std": 0.18577462434768677, - "rewards/accuracy_reward/mean": 0.173828125, - "rewards/accuracy_reward/std": 0.3793322443962097, + "grad_norm": 0.15643727779388428, + "kl": 0.04217529296875, + "learning_rate": 9.919446257662785e-07, + "loss": 0.1308, + "num_tokens": 329683690.0, + "reward": 1.091796875, + "reward_std": 0.2412022352218628, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.96337890625, - "rewards/tag_count_reward/std": 0.12216134369373322, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.19736173748970032, "step": 453 }, { @@ -13152,27 +13152,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 987.21484375, - "completions/mean_terminated_length": 950.7838745117188, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 975.494140625, + "completions/mean_terminated_length": 887.0634155273438, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, "epoch": 0.15498847827942305, - "grad_norm": 4.054177284240723, - "kl": 0.359375, - "learning_rate": 9.917475696313157e-07, - "loss": 0.0613, - "num_tokens": 305682364.0, - "reward": 1.0966796875, - "reward_std": 0.21819844841957092, - "rewards/accuracy_reward/mean": 0.1391129046678543, - "rewards/accuracy_reward/std": 0.3464137017726898, + "grad_norm": 0.14415857195854187, + "kl": 0.03485107421875, + "learning_rate": 9.918432898498802e-07, + "loss": 0.1004, + "num_tokens": 330256807.0, + "reward": 1.095703125, + "reward_std": 0.28144875168800354, + "rewards/accuracy_reward/mean": 0.16733871400356293, + "rewards/accuracy_reward/std": 0.37365487217903137, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9619140625, - "rewards/tag_count_reward/std": 0.11499445885419846, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.1869896799325943, "step": 454 }, { @@ -13181,27 +13181,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.111328125, + "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 1219.708984375, - "completions/mean_terminated_length": 1115.945068359375, - "completions/min_length": 255.0, - "completions/min_terminated_length": 255.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1161.822265625, + "completions/mean_terminated_length": 1016.8113403320312, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, "epoch": 0.15532986259281387, - "grad_norm": 842.3438720703125, - "kl": 11.1953125, - "learning_rate": 9.916450559893917e-07, - "loss": 0.55, - "num_tokens": 306388247.0, - "reward": 0.98046875, - "reward_std": 0.2435244619846344, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, + "grad_norm": 0.1531001180410385, + "kl": 0.03802490234375, + "learning_rate": 9.917413263418474e-07, + "loss": 0.146, + "num_tokens": 330933052.0, + "reward": 0.99560546875, + "reward_std": 0.321479469537735, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.1932915896177292, + "rewards/tag_count_reward/mean": 0.89208984375, + "rewards/tag_count_reward/std": 0.24017061293125153, "step": 455 }, { @@ -13210,27 +13210,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1999.0, - "completions/mean_length": 1036.408203125, - "completions/mean_terminated_length": 986.65771484375, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1013.353515625, + "completions/mean_terminated_length": 953.4978637695312, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, "epoch": 0.15567124690620465, - "grad_norm": 1.067717432975769, - "kl": 0.24267578125, - "learning_rate": 9.915419155130886e-07, - "loss": 0.0952, - "num_tokens": 307004200.0, - "reward": 1.08251953125, - "reward_std": 0.23473551869392395, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, + "grad_norm": 0.1461944282054901, + "kl": 0.03753662109375, + "learning_rate": 9.916387353870085e-07, + "loss": 0.1226, + "num_tokens": 331537201.0, + "reward": 1.0615234375, + "reward_std": 0.2562964856624603, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.94970703125, - "rewards/tag_count_reward/std": 0.1549411565065384, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.18260227143764496, "step": 456 }, { @@ -13239,27 +13239,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 987.5859375, - "completions/mean_terminated_length": 951.167724609375, - "completions/min_length": 207.0, - "completions/min_terminated_length": 207.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 980.486328125, + "completions/mean_terminated_length": 897.3325805664062, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.15601263121959547, - "grad_norm": 73.63131713867188, - "kl": 1.642578125, - "learning_rate": 9.914381483487957e-07, - "loss": 0.1524, - "num_tokens": 307580292.0, - "reward": 1.0546875, - "reward_std": 0.19174796342849731, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, + "grad_norm": 0.1541946977376938, + "kl": 0.0416259765625, + "learning_rate": 9.91535517131083e-07, + "loss": 0.1247, + "num_tokens": 332109658.0, + "reward": 1.00927734375, + "reward_std": 0.19664829969406128, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.962890625, - "rewards/tag_count_reward/std": 0.12740769982337952, + "rewards/tag_count_reward/mean": 0.94091796875, + "rewards/tag_count_reward/std": 0.18319980800151825, "step": 457 }, { @@ -13268,27 +13268,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 1013.505859375, - "completions/mean_terminated_length": 962.6290283203125, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 972.650390625, + "completions/mean_terminated_length": 888.8862915039062, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.15635401553298625, - "grad_norm": 0.8415966033935547, - "kl": 0.1641845703125, - "learning_rate": 9.913337546437912e-07, - "loss": 0.0852, - "num_tokens": 308176887.0, - "reward": 1.091796875, - "reward_std": 0.23131389915943146, - "rewards/accuracy_reward/mean": 0.142578125, - "rewards/accuracy_reward/std": 0.3499840497970581, + "grad_norm": 0.18938900530338287, + "kl": 0.040191650390625, + "learning_rate": 9.914316717206816e-07, + "loss": 0.1348, + "num_tokens": 332685335.0, + "reward": 1.08447265625, + "reward_std": 0.28646454215049744, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.94921875, - "rewards/tag_count_reward/std": 0.15118376910686493, + "rewards/tag_count_reward/mean": 0.93408203125, + "rewards/tag_count_reward/std": 0.19072304666042328, "step": 458 }, { @@ -13297,27 +13297,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.080078125, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 1108.814453125, - "completions/mean_terminated_length": 1027.0594482421875, - "completions/min_length": 55.0, - "completions/min_terminated_length": 55.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1084.564453125, + "completions/mean_terminated_length": 1005.1268310546875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, "epoch": 0.15669539984637706, - "grad_norm": 1.0721087455749512, - "kl": 0.175048828125, - "learning_rate": 9.91228734546243e-07, - "loss": 0.0635, - "num_tokens": 308819912.0, - "reward": 1.13818359375, - "reward_std": 0.25097551941871643, - "rewards/accuracy_reward/mean": 0.1953125, - "rewards/accuracy_reward/std": 0.3968288004398346, + "grad_norm": 0.1524220108985901, + "kl": 0.034881591796875, + "learning_rate": 9.913271993033058e-07, + "loss": 0.1326, + "num_tokens": 333315944.0, + "reward": 1.17041015625, + "reward_std": 0.323802649974823, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42402184009552, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.94287109375, - "rewards/tag_count_reward/std": 0.1663554310798645, + "rewards/tag_count_reward/mean": 0.93603515625, + "rewards/tag_count_reward/std": 0.1828918159008026, "step": 459 }, { @@ -13326,27 +13326,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.072265625, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 1101.990234375, - "completions/mean_terminated_length": 1028.301025390625, - "completions/min_length": 187.0, - "completions/min_terminated_length": 187.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1037.828125, + "completions/mean_terminated_length": 949.8939208984375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.15703678415976785, - "grad_norm": 1.2053905725479126, - "kl": 0.21875, - "learning_rate": 9.911230882052082e-07, - "loss": 0.095, - "num_tokens": 309464019.0, - "reward": 1.00048828125, - "reward_std": 0.21932029724121094, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, + "grad_norm": 0.21820634603500366, + "kl": 0.03631591796875, + "learning_rate": 9.912221000273474e-07, + "loss": 0.1279, + "num_tokens": 333927200.0, + "reward": 1.00537109375, + "reward_std": 0.26538532972335815, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.17743425071239471, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.1999441683292389, "step": 460 }, { @@ -13355,27 +13355,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.064453125, + "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 1110.123046875, - "completions/mean_terminated_length": 1045.5093994140625, - "completions/min_length": 117.0, - "completions/min_terminated_length": 117.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1068.623046875, + "completions/mean_terminated_length": 967.3081665039062, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, "epoch": 0.15737816847315866, - "grad_norm": 4.203698635101318, - "kl": 0.2410888671875, - "learning_rate": 9.91016815770632e-07, - "loss": 0.1079, - "num_tokens": 310110258.0, - "reward": 1.044921875, - "reward_std": 0.23291827738285065, + "grad_norm": 6.625924110412598, + "kl": 0.08343505859375, + "learning_rate": 9.91116374042089e-07, + "loss": 0.1406, + "num_tokens": 334552191.0, + "reward": 1.02392578125, + "reward_std": 0.26444312930107117, "rewards/accuracy_reward/mean": 0.10080645233392715, "rewards/accuracy_reward/std": 0.30137622356414795, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.947265625, - "rewards/tag_count_reward/std": 0.15293073654174805, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.19859671592712402, "step": 461 }, { @@ -13384,27 +13384,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1089.384765625, - "completions/mean_terminated_length": 1025.4771728515625, - "completions/min_length": 267.0, - "completions/min_terminated_length": 267.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1063.3828125, + "completions/mean_terminated_length": 977.673095703125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, "epoch": 0.15771955278654945, - "grad_norm": 1.4528241157531738, - "kl": 0.2091064453125, - "learning_rate": 9.90909917393349e-07, - "loss": 0.1112, - "num_tokens": 310744663.0, - "reward": 1.07275390625, - "reward_std": 0.21886608004570007, + "grad_norm": 0.23513740301132202, + "kl": 0.0377197265625, + "learning_rate": 9.910100214977032e-07, + "loss": 0.1174, + "num_tokens": 335173283.0, + "reward": 1.0546875, + "reward_std": 0.2536543309688568, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.94970703125, - "rewards/tag_count_reward/std": 0.1509426087141037, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.18889120221138, "step": 462 }, { @@ -13413,27 +13413,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.1015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 1066.322265625, - "completions/mean_terminated_length": 1011.6721801757812, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1059.017578125, + "completions/mean_terminated_length": 947.2195434570312, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.15806093709994026, - "grad_norm": 1.0756810903549194, - "kl": 0.197265625, - "learning_rate": 9.908023932250816e-07, - "loss": 0.0782, - "num_tokens": 311379004.0, - "reward": 1.07568359375, - "reward_std": 0.21483656764030457, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, + "grad_norm": 0.1616012156009674, + "kl": 0.03857421875, + "learning_rate": 9.90903042545252e-07, + "loss": 0.1206, + "num_tokens": 335803884.0, + "reward": 1.05126953125, + "reward_std": 0.2340162992477417, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.94873046875, - "rewards/tag_count_reward/std": 0.1481568068265915, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.19982466101646423, "step": 463 }, { @@ -13442,27 +13442,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.091796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1999.0, - "completions/mean_length": 1004.3046875, - "completions/mean_terminated_length": 955.2147216796875, - "completions/min_length": 212.0, - "completions/min_terminated_length": 212.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 1062.298828125, + "completions/mean_terminated_length": 962.6688232421875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, "epoch": 0.15840232141333105, - "grad_norm": 0.9950715899467468, - "kl": 0.27783203125, - "learning_rate": 9.906942434184411e-07, - "loss": 0.0879, - "num_tokens": 311966824.0, - "reward": 1.06884765625, - "reward_std": 0.2228804975748062, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, + "grad_norm": 0.15010185539722443, + "kl": 0.0364990234375, + "learning_rate": 9.907954373366884e-07, + "loss": 0.1089, + "num_tokens": 336421397.0, + "reward": 1.041015625, + "reward_std": 0.2394903600215912, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.95361328125, - "rewards/tag_count_reward/std": 0.14976051449775696, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.19257840514183044, "step": 464 }, { @@ -13471,27 +13471,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 1064.376953125, - "completions/mean_terminated_length": 1030.595947265625, - "completions/min_length": 206.0, - "completions/min_terminated_length": 206.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1107.087890625, + "completions/mean_terminated_length": 991.5372924804688, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.15874370572672186, - "grad_norm": 47.52520751953125, - "kl": 1.08154296875, - "learning_rate": 9.90585468126926e-07, - "loss": 0.1194, - "num_tokens": 312585881.0, - "reward": 1.09619140625, - "reward_std": 0.19244486093521118, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, + "grad_norm": 0.18746060132980347, + "kl": 0.03753662109375, + "learning_rate": 9.90687206024854e-07, + "loss": 0.1577, + "num_tokens": 337062322.0, + "reward": 1.03515625, + "reward_std": 0.26289528608322144, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.96337890625, - "rewards/tag_count_reward/std": 0.1211559996008873, + "rewards/tag_count_reward/mean": 0.912109375, + "rewards/tag_count_reward/std": 0.2219373881816864, "step": 465 }, { @@ -13500,27 +13500,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.111328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 1057.943359375, - "completions/mean_terminated_length": 1023.9414672851562, - "completions/min_length": 266.0, - "completions/min_terminated_length": 266.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1165.6953125, + "completions/mean_terminated_length": 1055.1649169921875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, "epoch": 0.15908509004011265, - "grad_norm": 2.6732664108276367, - "kl": 0.335205078125, - "learning_rate": 9.904760675049233e-07, - "loss": 0.0679, - "num_tokens": 313203164.0, - "reward": 1.05126953125, - "reward_std": 0.18792620301246643, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, + "grad_norm": 0.14396274089813232, + "kl": 0.03863525390625, + "learning_rate": 9.905783487634796e-07, + "loss": 0.0726, + "num_tokens": 337734774.0, + "reward": 0.99462890625, + "reward_std": 0.24153770506381989, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.96923828125, - "rewards/tag_count_reward/std": 0.10906112939119339, + "rewards/tag_count_reward/mean": 0.91455078125, + "rewards/tag_count_reward/std": 0.20959889888763428, "step": 466 }, { @@ -13529,27 +13529,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.1796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1119.1484375, - "completions/mean_terminated_length": 1059.2847900390625, - "completions/min_length": 89.0, - "completions/min_terminated_length": 89.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1173.458984375, + "completions/mean_terminated_length": 981.8928833007812, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, "epoch": 0.15942647435350346, - "grad_norm": 3.407625436782837, - "kl": 0.3837890625, - "learning_rate": 9.903660417077069e-07, - "loss": 0.0881, - "num_tokens": 313853624.0, - "reward": 1.037109375, - "reward_std": 0.19400563836097717, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, + "grad_norm": 0.21711322665214539, + "kl": 0.03863525390625, + "learning_rate": 9.904688657071858e-07, + "loss": 0.1423, + "num_tokens": 338413041.0, + "reward": 0.95849609375, + "reward_std": 0.30655211210250854, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.95703125, - "rewards/tag_count_reward/std": 0.12748268246650696, + "rewards/tag_count_reward/mean": 0.86083984375, + "rewards/tag_count_reward/std": 0.2622622549533844, "step": 467 }, { @@ -13558,27 +13558,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 991.5078125, - "completions/mean_terminated_length": 966.1520385742188, - "completions/min_length": 276.0, - "completions/min_terminated_length": 276.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1133.345703125, + "completions/mean_terminated_length": 1032.1583251953125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, "epoch": 0.15976785866689425, - "grad_norm": 0.8257997632026672, - "kl": 0.239990234375, - "learning_rate": 9.90255390891438e-07, - "loss": 0.0711, - "num_tokens": 314431452.0, - "reward": 1.017578125, - "reward_std": 0.14489471912384033, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, + "grad_norm": 0.1276520937681198, + "kl": 0.034698486328125, + "learning_rate": 9.903587570114814e-07, + "loss": 0.1449, + "num_tokens": 339063490.0, + "reward": 0.98974609375, + "reward_std": 0.24342095851898193, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.97265625, - "rewards/tag_count_reward/std": 0.11162548512220383, + "rewards/tag_count_reward/mean": 0.91162109375, + "rewards/tag_count_reward/std": 0.21643958985805511, "step": 468 }, { @@ -13587,27 +13587,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 1054.896484375, - "completions/mean_terminated_length": 1020.7899169921875, - "completions/min_length": 55.0, - "completions/min_terminated_length": 55.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1103.58984375, + "completions/mean_terminated_length": 999.1106567382812, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.16010924298028506, - "grad_norm": 0.830088198184967, - "kl": 0.285888671875, - "learning_rate": 9.90144115213166e-07, - "loss": 0.0645, - "num_tokens": 315046679.0, - "reward": 1.033203125, - "reward_std": 0.1966622769832611, - "rewards/accuracy_reward/mean": 0.07661290466785431, - "rewards/accuracy_reward/std": 0.2662447690963745, + "grad_norm": 0.2375566065311432, + "kl": 0.03875732421875, + "learning_rate": 9.902480228327645e-07, + "loss": 0.1268, + "num_tokens": 339703648.0, + "reward": 1.0361328125, + "reward_std": 0.24838443100452423, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310528099536896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.958984375, - "rewards/tag_count_reward/std": 0.1337306946516037, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.21008896827697754, "step": 469 }, { @@ -13616,27 +13616,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.115234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1020.796875, - "completions/mean_terminated_length": 979.0405883789062, - "completions/min_length": 228.0, - "completions/min_terminated_length": 228.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1122.00390625, + "completions/mean_terminated_length": 1001.3995361328125, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, "epoch": 0.16045062729367585, - "grad_norm": 28.442277908325195, - "kl": 0.573486328125, - "learning_rate": 9.900322148308256e-07, - "loss": 0.1236, - "num_tokens": 315650047.0, - "reward": 1.0419921875, - "reward_std": 0.19466054439544678, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, + "grad_norm": 0.30124250054359436, + "kl": 0.04132080078125, + "learning_rate": 9.90136663328321e-07, + "loss": 0.1767, + "num_tokens": 340358834.0, + "reward": 1.01123046875, + "reward_std": 0.30274468660354614, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9619140625, - "rewards/tag_count_reward/std": 0.12615279853343964, + "rewards/tag_count_reward/mean": 0.90185546875, + "rewards/tag_count_reward/std": 0.2335832417011261, "step": 470 }, { @@ -13645,27 +13645,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 1060.900390625, - "completions/mean_terminated_length": 1022.8579711914062, - "completions/min_length": 265.0, - "completions/min_terminated_length": 265.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1141.6796875, + "completions/mean_terminated_length": 1021.3717041015625, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, "epoch": 0.16079201160706666, - "grad_norm": 2.216592311859131, - "kl": 0.27783203125, - "learning_rate": 9.899196899032393e-07, - "loss": 0.093, - "num_tokens": 316262812.0, - "reward": 1.0986328125, - "reward_std": 0.253503680229187, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, + "grad_norm": 0.3650186061859131, + "kl": 0.04541015625, + "learning_rate": 9.900246786563254e-07, + "loss": 0.1238, + "num_tokens": 341012958.0, + "reward": 1.048828125, + "reward_std": 0.3344082236289978, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9658203125, - "rewards/tag_count_reward/std": 0.1203538104891777, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.2244802713394165, "step": 471 }, { @@ -13674,27 +13674,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.095703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1004.7265625, - "completions/mean_terminated_length": 975.3975830078125, - "completions/min_length": 187.0, - "completions/min_terminated_length": 187.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1056.5078125, + "completions/mean_terminated_length": 951.57666015625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.16113339592045745, - "grad_norm": 34.538063049316406, - "kl": 0.72900390625, - "learning_rate": 9.898065405901156e-07, - "loss": 0.123, - "num_tokens": 316842944.0, - "reward": 1.0537109375, - "reward_std": 0.18075445294380188, + "grad_norm": 262.1431884765625, + "kl": 1.83160400390625, + "learning_rate": 9.8991206897584e-07, + "loss": 0.1907, + "num_tokens": 341619602.0, + "reward": 1.001953125, + "reward_std": 0.2624667286872864, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9716796875, - "rewards/tag_count_reward/std": 0.10458524525165558, + "rewards/tag_count_reward/mean": 0.919921875, + "rewards/tag_count_reward/std": 0.20555779337882996, "step": 472 }, { @@ -13703,27 +13703,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.095703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 1024.359375, - "completions/mean_terminated_length": 984.90869140625, - "completions/min_length": 224.0, - "completions/min_terminated_length": 224.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1073.13671875, + "completions/mean_terminated_length": 969.9653930664062, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, "epoch": 0.16147478023384826, - "grad_norm": 0.7149609923362732, - "kl": 0.164306640625, - "learning_rate": 9.896927670520495e-07, - "loss": 0.0898, - "num_tokens": 317446280.0, - "reward": 1.134765625, - "reward_std": 0.22232946753501892, - "rewards/accuracy_reward/mean": 0.166015625, - "rewards/accuracy_reward/std": 0.3724585771560669, + "grad_norm": 7812.05810546875, + "kl": 123.5379638671875, + "learning_rate": 9.897988344468148e-07, + "loss": 5.0653, + "num_tokens": 342247912.0, + "reward": 1.0830078125, + "reward_std": 0.27097246050834656, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.96875, - "rewards/tag_count_reward/std": 0.11169394850730896, + "rewards/tag_count_reward/mean": 0.9189453125, + "rewards/tag_count_reward/std": 0.20277541875839233, "step": 473 }, { @@ -13732,27 +13732,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.103515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 973.15625, - "completions/mean_terminated_length": 949.556884765625, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1068.548828125, + "completions/mean_terminated_length": 955.453125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, "epoch": 0.16181616454723904, - "grad_norm": 67.18488311767578, - "kl": 1.100830078125, - "learning_rate": 9.895783694505212e-07, - "loss": 0.0751, - "num_tokens": 318013240.0, - "reward": 1.09521484375, - "reward_std": 0.17843782901763916, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310423493385315, + "grad_norm": 0.20627427101135254, + "kl": 0.0450439453125, + "learning_rate": 9.89684975230088e-07, + "loss": 0.1183, + "num_tokens": 342863713.0, + "reward": 1.06103515625, + "reward_std": 0.27761712670326233, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.97021484375, - "rewards/tag_count_reward/std": 0.11154734343290329, + "rewards/tag_count_reward/mean": 0.91259765625, + "rewards/tag_count_reward/std": 0.21285150945186615, "step": 474 }, { @@ -13761,27 +13761,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.103515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1131.65625, - "completions/mean_terminated_length": 1117.1112060546875, - "completions/min_length": 210.0, - "completions/min_terminated_length": 210.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1164.78515625, + "completions/mean_terminated_length": 1062.8017578125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, "epoch": 0.16215754886062986, - "grad_norm": 0.2888360023498535, - "kl": 0.1614990234375, - "learning_rate": 9.894633479478974e-07, - "loss": 0.0367, - "num_tokens": 318669240.0, - "reward": 1.05419921875, - "reward_std": 0.1976710706949234, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, + "grad_norm": 0.32532963156700134, + "kl": 0.04119873046875, + "learning_rate": 9.895704914873838e-07, + "loss": 0.1244, + "num_tokens": 343536675.0, + "reward": 0.98291015625, + "reward_std": 0.2782757878303528, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.96826171875, - "rewards/tag_count_reward/std": 0.11426402628421783, + "rewards/tag_count_reward/mean": 0.89501953125, + "rewards/tag_count_reward/std": 0.23058590292930603, "step": 475 }, { @@ -13790,27 +13790,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 994.228515625, - "completions/mean_terminated_length": 935.5650024414062, - "completions/min_length": 8.0, - "completions/min_terminated_length": 8.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1020.779296875, + "completions/mean_terminated_length": 933.7266845703125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.16249893317402064, - "grad_norm": 1.06907320022583, - "kl": 0.17431640625, - "learning_rate": 9.893477027074303e-07, - "loss": 0.0649, - "num_tokens": 319255485.0, - "reward": 1.13916015625, - "reward_std": 0.22496505081653595, - "rewards/accuracy_reward/mean": 0.17943547666072845, - "rewards/accuracy_reward/std": 0.3841039538383484, + "grad_norm": 0.29765912890434265, + "kl": 0.03851318359375, + "learning_rate": 9.89455383381315e-07, + "loss": 0.1446, + "num_tokens": 344136514.0, + "reward": 1.1279296875, + "reward_std": 0.2812463343143463, + "rewards/accuracy_reward/mean": 0.2036290317773819, + "rewards/accuracy_reward/std": 0.4031028151512146, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.96533203125, - "rewards/tag_count_reward/std": 0.1197040006518364, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.1930219829082489, "step": 476 }, { @@ -13819,27 +13819,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.16015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1008.46484375, - "completions/mean_terminated_length": 968.401611328125, - "completions/min_length": 217.0, - "completions/min_terminated_length": 217.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1106.541015625, + "completions/mean_terminated_length": 927.0069580078125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.16284031748741146, - "grad_norm": 0.6685115098953247, - "kl": 0.1739501953125, - "learning_rate": 9.89231433893257e-07, - "loss": 0.067, - "num_tokens": 319849163.0, - "reward": 1.0595703125, - "reward_std": 0.2082090675830841, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, + "grad_norm": 0.33845609426498413, + "kl": 0.0445556640625, + "learning_rate": 9.893396510753802e-07, + "loss": 0.1348, + "num_tokens": 344780407.0, + "reward": 1.00244140625, + "reward_std": 0.3071286678314209, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9638671875, - "rewards/tag_count_reward/std": 0.12280593812465668, + "rewards/tag_count_reward/mean": 0.88330078125, + "rewards/tag_count_reward/std": 0.24961701035499573, "step": 477 }, { @@ -13848,27 +13848,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 1101.3984375, - "completions/mean_terminated_length": 1058.89794921875, - "completions/min_length": 244.0, - "completions/min_terminated_length": 244.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1129.001953125, + "completions/mean_terminated_length": 1049.0042724609375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.16318170180080224, - "grad_norm": 1.3302628993988037, - "kl": 0.180908203125, - "learning_rate": 9.891145416703998e-07, - "loss": 0.0726, - "num_tokens": 320490119.0, - "reward": 1.05517578125, - "reward_std": 0.20020583271980286, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, + "grad_norm": 0.27333542704582214, + "kl": 0.0372314453125, + "learning_rate": 9.892232947339646e-07, + "loss": 0.0807, + "num_tokens": 345435496.0, + "reward": 1.0166015625, + "reward_std": 0.2632399797439575, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.96142578125, - "rewards/tag_count_reward/std": 0.12551778554916382, + "rewards/tag_count_reward/mean": 0.9326171875, + "rewards/tag_count_reward/std": 0.1806451380252838, "step": 478 }, { @@ -13877,27 +13877,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1059.849609375, - "completions/mean_terminated_length": 1013.3721923828125, - "completions/min_length": 123.0, - "completions/min_terminated_length": 123.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1125.5859375, + "completions/mean_terminated_length": 1023.5401611328125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.16352308611419306, - "grad_norm": 1.8521498441696167, - "kl": 0.22265625, - "learning_rate": 9.889970262047658e-07, - "loss": 0.0861, - "num_tokens": 321106218.0, - "reward": 0.98583984375, - "reward_std": 0.18413066864013672, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, + "grad_norm": 0.5575989484786987, + "kl": 0.05474853515625, + "learning_rate": 9.891063145223405e-07, + "loss": 0.1207, + "num_tokens": 346085252.0, + "reward": 0.9619140625, + "reward_std": 0.2514452338218689, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.95263671875, - "rewards/tag_count_reward/std": 0.14103294909000397, + "rewards/tag_count_reward/mean": 0.9111328125, + "rewards/tag_count_reward/std": 0.2142503559589386, "step": 479 }, { @@ -13906,27 +13906,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1031.974609375, - "completions/mean_terminated_length": 1011.735107421875, - "completions/min_length": 180.0, - "completions/min_terminated_length": 180.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1052.884765625, + "completions/mean_terminated_length": 982.1024780273438, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.16386447042758384, - "grad_norm": 3.3183910846710205, - "kl": 0.25537109375, - "learning_rate": 9.888788876631467e-07, - "loss": 0.092, - "num_tokens": 321712525.0, - "reward": 1.126953125, - "reward_std": 0.24029642343521118, - "rewards/accuracy_reward/mean": 0.16015625, - "rewards/accuracy_reward/std": 0.3671095669269562, + "grad_norm": 243.59165954589844, + "kl": 1.30230712890625, + "learning_rate": 9.889887106066654e-07, + "loss": 0.1438, + "num_tokens": 346702265.0, + "reward": 1.0771484375, + "reward_std": 0.25558602809906006, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.966796875, - "rewards/tag_count_reward/std": 0.11960916966199875, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.17660093307495117, "step": 480 }, { @@ -13935,27 +13935,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 1030.52734375, - "completions/mean_terminated_length": 989.1666259765625, - "completions/min_length": 46.0, - "completions/min_terminated_length": 46.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1056.18359375, + "completions/mean_terminated_length": 962.9359741210938, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.16420585474097465, - "grad_norm": 310.8680725097656, - "kl": 8.68359375, - "learning_rate": 9.887601262132187e-07, - "loss": 0.4131, - "num_tokens": 322314619.0, - "reward": 1.0546875, - "reward_std": 0.2168104648590088, - "rewards/accuracy_reward/mean": 0.10080645233392715, - "rewards/accuracy_reward/std": 0.30137622356414795, + "grad_norm": 32.50343322753906, + "kl": 0.28216552734375, + "learning_rate": 9.888704831539838e-07, + "loss": 0.1373, + "num_tokens": 347317495.0, + "reward": 1.05810546875, + "reward_std": 0.24973270297050476, + "rewards/accuracy_reward/mean": 0.13508065044879913, + "rewards/accuracy_reward/std": 0.3421548008918762, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.95703125, - "rewards/tag_count_reward/std": 0.13493992388248444, + "rewards/tag_count_reward/mean": 0.92724609375, + "rewards/tag_count_reward/std": 0.19460642337799072, "step": 481 }, { @@ -13964,27 +13964,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.126953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 1054.1328125, - "completions/mean_terminated_length": 1005.2540283203125, - "completions/min_length": 227.0, - "completions/min_terminated_length": 227.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1123.12890625, + "completions/mean_terminated_length": 988.6398315429688, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, "epoch": 0.16454723905436544, - "grad_norm": 31.778573989868164, - "kl": 1.4140625, - "learning_rate": 9.88640742023542e-07, - "loss": 0.1303, - "num_tokens": 322938319.0, - "reward": 1.03369140625, - "reward_std": 0.20622281730175018, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, + "grad_norm": 2.283095359802246, + "kl": 0.085693359375, + "learning_rate": 9.88751632332225e-07, + "loss": 0.0917, + "num_tokens": 347976521.0, + "reward": 0.953125, + "reward_std": 0.24625083804130554, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.95166015625, - "rewards/tag_count_reward/std": 0.14665940403938293, + "rewards/tag_count_reward/mean": 0.896484375, + "rewards/tag_count_reward/std": 0.23204052448272705, "step": 482 }, { @@ -13993,27 +13993,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 1053.232421875, - "completions/mean_terminated_length": 1008.5693359375, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1105.572265625, + "completions/mean_terminated_length": 1021.3552856445312, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.16488862336775625, - "grad_norm": 2.6055119037628174, - "kl": 0.40234375, - "learning_rate": 9.8852073526356e-07, - "loss": 0.1049, - "num_tokens": 323550070.0, - "reward": 1.00537109375, - "reward_std": 0.19062183797359467, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, + "grad_norm": 0.1593397706747055, + "kl": 0.036376953125, + "learning_rate": 9.88632158310204e-07, + "loss": 0.0721, + "num_tokens": 348615070.0, + "reward": 1.0361328125, + "reward_std": 0.2503836154937744, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.16180720925331116, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.19661229848861694, "step": 483 }, { @@ -14022,27 +14022,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 956.6328125, - "completions/mean_terminated_length": 909.9552612304688, - "completions/min_length": 125.0, - "completions/min_terminated_length": 125.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1033.029296875, + "completions/mean_terminated_length": 956.266845703125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.16523000768114704, - "grad_norm": 303.0550842285156, - "kl": 10.84619140625, - "learning_rate": 9.884001061036013e-07, - "loss": 0.5225, - "num_tokens": 324120074.0, - "reward": 1.0517578125, - "reward_std": 0.21274060010910034, - "rewards/accuracy_reward/mean": 0.09879032522439957, - "rewards/accuracy_reward/std": 0.2986815273761749, + "grad_norm": 0.20737029612064362, + "kl": 0.0379638671875, + "learning_rate": 9.885120612576208e-07, + "loss": 0.0928, + "num_tokens": 349224189.0, + "reward": 1.0654296875, + "reward_std": 0.23803508281707764, + "rewards/accuracy_reward/mean": 0.13508065044879913, + "rewards/accuracy_reward/std": 0.3421548008918762, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9560546875, - "rewards/tag_count_reward/std": 0.15090225636959076, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.1892828494310379, "step": 484 }, { @@ -14051,27 +14051,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0859375, + "completions/clipped_ratio": 0.103515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 1072.443359375, - "completions/mean_terminated_length": 980.7244262695312, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1102.4375, + "completions/mean_terminated_length": 993.2548828125, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.16557139199453785, - "grad_norm": 80.19758605957031, - "kl": 2.73095703125, - "learning_rate": 9.882788547148764e-07, - "loss": 0.2371, - "num_tokens": 324760365.0, - "reward": 1.09326171875, - "reward_std": 0.32172858715057373, - "rewards/accuracy_reward/mean": 0.1640625, - "rewards/accuracy_reward/std": 0.37069445848464966, + "grad_norm": 0.19747956097126007, + "kl": 0.03814697265625, + "learning_rate": 9.88391341345061e-07, + "loss": 0.0927, + "num_tokens": 349879837.0, + "reward": 1.109375, + "reward_std": 0.307794988155365, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.4027182459831238, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.18896137177944183, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.2266491949558258, "step": 485 }, { @@ -14080,27 +14080,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.1328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 1032.154296875, - "completions/mean_terminated_length": 1001.4949340820312, - "completions/min_length": 99.0, - "completions/min_terminated_length": 99.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1173.375, + "completions/mean_terminated_length": 1039.4234619140625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.16591277630792864, - "grad_norm": 1.6599100828170776, - "kl": 0.298095703125, - "learning_rate": 9.881569812694795e-07, - "loss": 0.0698, - "num_tokens": 325370156.0, - "reward": 1.0576171875, - "reward_std": 0.20596779882907867, - "rewards/accuracy_reward/mean": 0.0947580635547638, - "rewards/accuracy_reward/std": 0.29317617416381836, + "grad_norm": 0.18783214688301086, + "kl": 0.041015625, + "learning_rate": 9.882699987439943e-07, + "loss": 0.1592, + "num_tokens": 350561933.0, + "reward": 1.00390625, + "reward_std": 0.299715131521225, + "rewards/accuracy_reward/mean": 0.10483870655298233, + "rewards/accuracy_reward/std": 0.30665475130081177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9658203125, - "rewards/tag_count_reward/std": 0.1203538104891777, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.22822889685630798, "step": 486 }, { @@ -14109,27 +14109,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 985.86328125, - "completions/mean_terminated_length": 949.3859252929688, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1107.548828125, + "completions/mean_terminated_length": 994.3654174804688, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.16625416062131945, - "grad_norm": 24.591007232666016, - "kl": 0.68408203125, - "learning_rate": 9.880344859403876e-07, - "loss": 0.1034, - "num_tokens": 325951638.0, - "reward": 1.06884765625, - "reward_std": 0.2342439591884613, - "rewards/accuracy_reward/mean": 0.11088709533214569, - "rewards/accuracy_reward/std": 0.3143092691898346, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.95947265625, - "rewards/tag_count_reward/std": 0.14315126836299896, + "grad_norm": 0.15802687406539917, + "kl": 0.03765869140625, + "learning_rate": 9.88148033626775e-07, + "loss": 0.093, + "num_tokens": 351205718.0, + "reward": 1.01513671875, + "reward_std": 0.26605281233787537, + "rewards/accuracy_reward/mean": 0.10685484111309052, + "rewards/accuracy_reward/std": 0.30924052000045776, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91162109375, + "rewards/tag_count_reward/std": 0.2181282937526703, "step": 487 }, { @@ -14138,27 +14138,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.130859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 1083.4609375, - "completions/mean_terminated_length": 1031.860107421875, - "completions/min_length": 81.0, - "completions/min_terminated_length": 81.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1203.0078125, + "completions/mean_terminated_length": 1075.7843017578125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, "epoch": 0.16659554493471024, - "grad_norm": 2.2207202911376953, - "kl": 0.44921875, - "learning_rate": 9.879113689014606e-07, - "loss": 0.0804, - "num_tokens": 326580738.0, - "reward": 1.046875, - "reward_std": 0.1927567422389984, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, + "grad_norm": 0.147533118724823, + "kl": 0.03314208984375, + "learning_rate": 9.880254461666415e-07, + "loss": 0.1096, + "num_tokens": 351896026.0, + "reward": 1.0126953125, + "reward_std": 0.24784868955612183, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9609375, - "rewards/tag_count_reward/std": 0.12778215110301971, + "rewards/tag_count_reward/mean": 0.9013671875, + "rewards/tag_count_reward/std": 0.21680405735969543, "step": 488 }, { @@ -14167,27 +14167,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, + "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 1048.501953125, - "completions/mean_terminated_length": 981.8687744140625, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1140.591796875, + "completions/mean_terminated_length": 1040.2060546875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, "epoch": 0.16693692924810105, - "grad_norm": 51.33022689819336, - "kl": 1.2646484375, - "learning_rate": 9.877876303274404e-07, - "loss": 0.1344, - "num_tokens": 327202627.0, - "reward": 1.0400390625, - "reward_std": 0.2218092828989029, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, + "grad_norm": 0.13074490427970886, + "kl": 0.035797119140625, + "learning_rate": 9.879022365377164e-07, + "loss": 0.1082, + "num_tokens": 352565065.0, + "reward": 1.05078125, + "reward_std": 0.2811756432056427, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9482421875, - "rewards/tag_count_reward/std": 0.1548524796962738, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.20478467643260956, "step": 489 }, { @@ -14196,27 +14196,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1969.0, - "completions/mean_length": 982.30078125, - "completions/mean_terminated_length": 961.0717163085938, - "completions/min_length": 256.0, - "completions/min_terminated_length": 256.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1169.712890625, + "completions/mean_terminated_length": 1053.1260986328125, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, "epoch": 0.16727831356149184, - "grad_norm": 3.332066774368286, - "kl": 0.2041015625, - "learning_rate": 9.876632703939517e-07, - "loss": 0.0525, - "num_tokens": 327783885.0, - "reward": 1.080078125, - "reward_std": 0.22514007985591888, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, + "grad_norm": 0.14120085537433624, + "kl": 0.0350341796875, + "learning_rate": 9.877784049150062e-07, + "loss": 0.0626, + "num_tokens": 353242278.0, + "reward": 1.07958984375, + "reward_std": 0.3144848048686981, + "rewards/accuracy_reward/mean": 0.173828125, + "rewards/accuracy_reward/std": 0.3793322443962097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.966796875, - "rewards/tag_count_reward/std": 0.11858218908309937, + "rewards/tag_count_reward/mean": 0.90576171875, + "rewards/tag_count_reward/std": 0.22070206701755524, "step": 490 }, { @@ -14225,27 +14225,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.142578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1988.0, - "completions/mean_length": 959.12890625, - "completions/mean_terminated_length": 935.2215576171875, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1174.03515625, + "completions/mean_terminated_length": 1028.7061767578125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.16761969787488265, - "grad_norm": 3.5415234565734863, - "kl": 0.304443359375, - "learning_rate": 9.875382892775e-07, - "loss": 0.0913, - "num_tokens": 328349167.0, - "reward": 1.0859375, - "reward_std": 0.22196674346923828, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, + "grad_norm": 0.16105440258979797, + "kl": 0.0406494140625, + "learning_rate": 9.876539514744e-07, + "loss": 0.1167, + "num_tokens": 353917592.0, + "reward": 1.02587890625, + "reward_std": 0.30852940678596497, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.96875, - "rewards/tag_count_reward/std": 0.11386296153068542, + "rewards/tag_count_reward/mean": 0.88720703125, + "rewards/tag_count_reward/std": 0.24097691476345062, "step": 491 }, { @@ -14254,27 +14254,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1986.0, - "completions/mean_length": 1010.57421875, - "completions/mean_terminated_length": 974.9454956054688, - "completions/min_length": 220.0, - "completions/min_terminated_length": 220.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1143.662109375, + "completions/mean_terminated_length": 1032.60302734375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.16796108218827344, - "grad_norm": 1.6188727617263794, - "kl": 0.315185546875, - "learning_rate": 9.874126871554738e-07, - "loss": 0.0675, - "num_tokens": 328941525.0, - "reward": 1.07421875, - "reward_std": 0.2316889762878418, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, + "grad_norm": 0.1420840173959732, + "kl": 0.0360107421875, + "learning_rate": 9.875288763926716e-07, + "loss": 0.1238, + "num_tokens": 354578091.0, + "reward": 1.10791015625, + "reward_std": 0.3026808202266693, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.4027182459831238, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.14826077222824097, + "rewards/tag_count_reward/mean": 0.90478515625, + "rewards/tag_count_reward/std": 0.2224915623664856, "step": 492 }, { @@ -14283,27 +14283,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.119140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 1048.296875, - "completions/mean_terminated_length": 1026.3472900390625, - "completions/min_length": 57.0, - "completions/min_terminated_length": 57.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1208.732421875, + "completions/mean_terminated_length": 1095.21728515625, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.16830246650166425, - "grad_norm": 2.26784086227417, - "kl": 0.2509765625, - "learning_rate": 9.872864642061419e-07, - "loss": 0.0641, - "num_tokens": 329553245.0, - "reward": 1.08056640625, - "reward_std": 0.23452627658843994, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310423493385315, + "grad_norm": 0.14380201697349548, + "kl": 0.03558349609375, + "learning_rate": 9.87403179847476e-07, + "loss": 0.0951, + "num_tokens": 355271954.0, + "reward": 1.04345703125, + "reward_std": 0.2788853645324707, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.95556640625, - "rewards/tag_count_reward/std": 0.13582131266593933, + "rewards/tag_count_reward/mean": 0.89306640625, + "rewards/tag_count_reward/std": 0.23021680116653442, "step": 493 }, { @@ -14312,27 +14312,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.095703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 864.888671875, - "completions/mean_terminated_length": 846.1091918945312, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1055.939453125, + "completions/mean_terminated_length": 950.9481201171875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.16864385081505504, - "grad_norm": 4.473270416259766, - "kl": 0.31396484375, - "learning_rate": 9.87159620608655e-07, - "loss": 0.0337, - "num_tokens": 330064740.0, - "reward": 1.04345703125, - "reward_std": 0.17220209538936615, - "rewards/accuracy_reward/mean": 0.07661290466785431, - "rewards/accuracy_reward/std": 0.2662447690963745, + "grad_norm": 0.1528129130601883, + "kl": 0.03900146484375, + "learning_rate": 9.872768620173523e-07, + "loss": 0.1204, + "num_tokens": 355881267.0, + "reward": 1.00341796875, + "reward_std": 0.2849913239479065, + "rewards/accuracy_reward/mean": 0.10685484111309052, + "rewards/accuracy_reward/std": 0.30924052000045776, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.96923828125, - "rewards/tag_count_reward/std": 0.11769144982099533, + "rewards/tag_count_reward/mean": 0.89990234375, + "rewards/tag_count_reward/std": 0.22418582439422607, "step": 494 }, { @@ -14341,27 +14341,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.00390625, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1980.0, - "completions/mean_length": 866.771484375, - "completions/mean_terminated_length": 862.1392822265625, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1029.697265625, + "completions/mean_terminated_length": 929.1781005859375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.16898523512844585, - "grad_norm": 1.921745777130127, - "kl": 0.212890625, - "learning_rate": 9.87032156543044e-07, - "loss": 0.0083, - "num_tokens": 330589647.0, - "reward": 1.0869140625, - "reward_std": 0.1914043426513672, - "rewards/accuracy_reward/mean": 0.11290322244167328, - "rewards/accuracy_reward/std": 0.3167939782142639, + "grad_norm": 0.1555251181125641, + "kl": 0.03839111328125, + "learning_rate": 9.87149923081722e-07, + "loss": 0.1316, + "num_tokens": 356489592.0, + "reward": 1.0419921875, + "reward_std": 0.295937180519104, + "rewards/accuracy_reward/mean": 0.12903225421905518, + "rewards/accuracy_reward/std": 0.33557409048080444, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9775390625, - "rewards/tag_count_reward/std": 0.08975613117218018, + "rewards/tag_count_reward/mean": 0.9169921875, + "rewards/tag_count_reward/std": 0.2120266854763031, "step": 495 }, { @@ -14370,27 +14370,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1946.0, - "completions/mean_length": 958.35546875, - "completions/mean_terminated_length": 927.7228393554688, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1135.919921875, + "completions/mean_terminated_length": 1058.625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, "epoch": 0.16932661944183663, - "grad_norm": 1.4758943319320679, - "kl": 0.21435546875, - "learning_rate": 9.869040721902213e-07, - "loss": 0.0664, - "num_tokens": 331151141.0, - "reward": 1.16357421875, - "reward_std": 0.24888169765472412, - "rewards/accuracy_reward/mean": 0.193359375, - "rewards/accuracy_reward/std": 0.39531853795051575, + "grad_norm": 0.128618985414505, + "kl": 0.032958984375, + "learning_rate": 9.870223632208875e-07, + "loss": 0.0608, + "num_tokens": 357141999.0, + "reward": 1.15087890625, + "reward_std": 0.32117411494255066, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41380295157432556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.97021484375, - "rewards/tag_count_reward/std": 0.12301874905824661, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.1867896467447281, "step": 496 }, { @@ -14399,27 +14399,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1943.0, - "completions/mean_length": 967.48828125, - "completions/mean_terminated_length": 952.5109252929688, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1160.767578125, + "completions/mean_terminated_length": 1053.989013671875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, "epoch": 0.16966800375522745, - "grad_norm": 0.5236533284187317, - "kl": 0.15771484375, - "learning_rate": 9.86775367731979e-07, - "loss": 0.0074, - "num_tokens": 331723199.0, - "reward": 1.03759765625, - "reward_std": 0.17600062489509583, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.0, + "grad_norm": 0.1433144360780716, + "kl": 0.03558349609375, + "learning_rate": 9.868941826160349e-07, + "loss": 0.0866, + "num_tokens": 357813016.0, + "reward": 1.01806640625, + "reward_std": 0.29755228757858276, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.97509765625, - "rewards/tag_count_reward/std": 0.10009774565696716, + "rewards/tag_count_reward/mean": 0.90087890625, + "rewards/tag_count_reward/std": 0.22188088297843933, "step": 497 }, { @@ -14428,27 +14428,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.00390625, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1974.0, - "completions/mean_length": 909.521484375, - "completions/mean_terminated_length": 905.0569458007812, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1075.751953125, + "completions/mean_terminated_length": 1000.0189208984375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, "epoch": 0.17000938806861823, - "grad_norm": 0.37404993176460266, - "kl": 0.140869140625, - "learning_rate": 9.866460433509893e-07, - "loss": 0.0024, - "num_tokens": 332269418.0, - "reward": 1.095703125, - "reward_std": 0.19775967299938202, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, + "grad_norm": 0.1315048784017563, + "kl": 0.03363037109375, + "learning_rate": 9.86765381449231e-07, + "loss": 0.0788, + "num_tokens": 358444345.0, + "reward": 1.0986328125, + "reward_std": 0.2701328992843628, + "rewards/accuracy_reward/mean": 0.162109375, + "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.98046875, - "rewards/tag_count_reward/std": 0.08769373595714569, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.18138417601585388, "step": 498 }, { @@ -14457,27 +14457,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 912.21875, - "completions/mean_terminated_length": 891.8966064453125, - "completions/min_length": 233.0, - "completions/min_terminated_length": 233.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1068.4140625, + "completions/mean_terminated_length": 971.7167358398438, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, "epoch": 0.17035077238200905, - "grad_norm": 2.436751127243042, - "kl": 0.2852783203125, - "learning_rate": 9.865160992308047e-07, - "loss": 0.0769, - "num_tokens": 332812362.0, - "reward": 1.08544921875, - "reward_std": 0.2063451111316681, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, + "grad_norm": 0.13883817195892334, + "kl": 0.03594970703125, + "learning_rate": 9.866359599034239e-07, + "loss": 0.0684, + "num_tokens": 359067261.0, + "reward": 1.03662109375, + "reward_std": 0.2716313302516937, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.97607421875, - "rewards/tag_count_reward/std": 0.10392878204584122, + "rewards/tag_count_reward/mean": 0.92529296875, + "rewards/tag_count_reward/std": 0.20490537583827972, "step": 499 }, { @@ -14486,27 +14486,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.1328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 864.638671875, - "completions/mean_terminated_length": 852.9684448242188, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1080.716796875, + "completions/mean_terminated_length": 932.5743408203125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, "epoch": 0.17069215669539983, - "grad_norm": 4.616459369659424, - "kl": 0.337158203125, - "learning_rate": 9.863855355558573e-07, - "loss": 0.0282, - "num_tokens": 333329649.0, - "reward": 1.13720703125, - "reward_std": 0.22087720036506653, - "rewards/accuracy_reward/mean": 0.162109375, - "rewards/accuracy_reward/std": 0.3689115643501282, + "grad_norm": 0.18865686655044556, + "kl": 0.04241943359375, + "learning_rate": 9.865059181624434e-07, + "loss": 0.1159, + "num_tokens": 359695180.0, + "reward": 1.0673828125, + "reward_std": 0.3113729655742645, + "rewards/accuracy_reward/mean": 0.169921875, + "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.97509765625, - "rewards/tag_count_reward/std": 0.10009774565696716, + "rewards/tag_count_reward/mean": 0.8974609375, + "rewards/tag_count_reward/std": 0.23561012744903564, "step": 500 }, { @@ -14515,27 +14515,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.11328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, - "completions/mean_length": 994.232421875, - "completions/mean_terminated_length": 971.0958251953125, - "completions/min_length": 205.0, - "completions/min_terminated_length": 205.0, + "completions/mean_length": 1162.13671875, + "completions/mean_terminated_length": 1048.9647216796875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, "epoch": 0.17103354100879065, - "grad_norm": 1.0512064695358276, - "kl": 0.1864013671875, - "learning_rate": 9.862543525114582e-07, - "loss": 0.028, - "num_tokens": 333918568.0, - "reward": 1.1123046875, - "reward_std": 0.20634591579437256, - "rewards/accuracy_reward/mean": 0.14453125, - "rewards/accuracy_reward/std": 0.35197147727012634, + "grad_norm": 0.1335000842809677, + "kl": 0.03570556640625, + "learning_rate": 9.863752564110003e-07, + "loss": 0.1143, + "num_tokens": 360370066.0, + "reward": 1.080078125, + "reward_std": 0.28146272897720337, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9677734375, - "rewards/tag_count_reward/std": 0.1157233864068985, + "rewards/tag_count_reward/mean": 0.908203125, + "rewards/tag_count_reward/std": 0.2203473001718521, "step": 501 }, { @@ -14544,27 +14544,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 885.63671875, - "completions/mean_terminated_length": 874.173583984375, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1028.130859375, + "completions/mean_terminated_length": 982.3407592773438, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, "epoch": 0.17137492532218146, - "grad_norm": 0.5130018591880798, - "kl": 0.129638671875, - "learning_rate": 9.861225502837976e-07, - "loss": 0.0264, - "num_tokens": 334445566.0, - "reward": 1.08935546875, - "reward_std": 0.1409814953804016, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, + "grad_norm": 0.13408465683460236, + "kl": 0.03814697265625, + "learning_rate": 9.862439748346854e-07, + "loss": 0.0502, + "num_tokens": 360970021.0, + "reward": 1.08837890625, + "reward_std": 0.19343972206115723, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.97802734375, - "rewards/tag_count_reward/std": 0.0878121480345726, + "rewards/tag_count_reward/mean": 0.95556640625, + "rewards/tag_count_reward/std": 0.14953702688217163, "step": 502 }, { @@ -14573,27 +14573,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.111328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 983.76953125, - "completions/mean_terminated_length": 969.017822265625, - "completions/min_length": 84.0, - "completions/min_terminated_length": 84.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1139.931640625, + "completions/mean_terminated_length": 1026.1737060546875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.17171630963557225, - "grad_norm": 0.9352298378944397, - "kl": 0.1884765625, - "learning_rate": 9.859901290599448e-07, - "loss": 0.0431, - "num_tokens": 335032824.0, - "reward": 1.111328125, - "reward_std": 0.21735718846321106, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, + "grad_norm": 0.18070316314697266, + "kl": 0.036865234375, + "learning_rate": 9.861120736199701e-07, + "loss": 0.1277, + "num_tokens": 361637234.0, + "reward": 1.06201171875, + "reward_std": 0.2937353253364563, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.98046875, - "rewards/tag_count_reward/std": 0.09044018387794495, + "rewards/tag_count_reward/mean": 0.91552734375, + "rewards/tag_count_reward/std": 0.2202296257019043, "step": 503 }, { @@ -14602,27 +14602,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 925.0859375, - "completions/mean_terminated_length": 893.51806640625, - "completions/min_length": 222.0, - "completions/min_terminated_length": 222.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1070.6484375, + "completions/mean_terminated_length": 996.7311401367188, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, "epoch": 0.17205769394896306, - "grad_norm": 2.7850639820098877, - "kl": 0.2857666015625, - "learning_rate": 9.858570890278475e-07, - "loss": 0.0822, - "num_tokens": 335584852.0, - "reward": 1.08203125, - "reward_std": 0.16623443365097046, - "rewards/accuracy_reward/mean": 0.11088709533214569, - "rewards/accuracy_reward/std": 0.3143092691898346, + "grad_norm": 0.12468232959508896, + "kl": 0.037445068359375, + "learning_rate": 9.859795529542069e-07, + "loss": 0.0588, + "num_tokens": 362263790.0, + "reward": 1.0478515625, + "reward_std": 0.23821130394935608, + "rewards/accuracy_reward/mean": 0.11491935700178146, + "rewards/accuracy_reward/std": 0.3192465901374817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.974609375, - "rewards/tag_count_reward/std": 0.11208678781986237, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.18406164646148682, "step": 504 }, { @@ -14631,27 +14631,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.083984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 910.5546875, - "completions/mean_terminated_length": 887.8964233398438, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1109.9140625, + "completions/mean_terminated_length": 1023.9061889648438, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.17239907826235384, - "grad_norm": 1.236697793006897, - "kl": 0.231201171875, - "learning_rate": 9.857234303763317e-07, - "loss": 0.0535, - "num_tokens": 336125184.0, - "reward": 1.1552734375, - "reward_std": 0.24600103497505188, - "rewards/accuracy_reward/mean": 0.181640625, - "rewards/accuracy_reward/std": 0.38592514395713806, + "grad_norm": 0.7725427746772766, + "kl": 0.0546875, + "learning_rate": 9.858464130256268e-07, + "loss": 0.1126, + "num_tokens": 362906194.0, + "reward": 1.125, + "reward_std": 0.3140985071659088, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3968288004398346, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9736328125, - "rewards/tag_count_reward/std": 0.09910601377487183, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.1951904594898224, "step": 505 }, { @@ -14660,27 +14660,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.09765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 961.521484375, - "completions/mean_terminated_length": 933.2164306640625, - "completions/min_length": 215.0, - "completions/min_terminated_length": 215.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1094.91796875, + "completions/mean_terminated_length": 991.7705688476562, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.17274046257574466, - "grad_norm": 2.1265530586242676, - "kl": 0.252685546875, - "learning_rate": 9.855891532951015e-07, - "loss": 0.061, - "num_tokens": 336696107.0, - "reward": 1.12548828125, - "reward_std": 0.24798564612865448, - "rewards/accuracy_reward/mean": 0.158203125, - "rewards/accuracy_reward/std": 0.36528825759887695, + "grad_norm": 0.1942758709192276, + "kl": 0.03955078125, + "learning_rate": 9.85712654023341e-07, + "loss": 0.1022, + "num_tokens": 363545416.0, + "reward": 1.11083984375, + "reward_std": 0.2971748113632202, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39069411158561707, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.96728515625, - "rewards/tag_count_reward/std": 0.1202535331249237, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.20950312912464142, "step": 506 }, { @@ -14689,27 +14689,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 987.826171875, - "completions/mean_terminated_length": 937.9611206054688, - "completions/min_length": 211.0, - "completions/min_terminated_length": 211.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1102.806640625, + "completions/mean_terminated_length": 1005.0280151367188, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, "epoch": 0.17308184688913544, - "grad_norm": 2.6731009483337402, - "kl": 0.27392578125, - "learning_rate": 9.854542579747383e-07, - "loss": 0.0921, - "num_tokens": 337282306.0, - "reward": 1.083984375, - "reward_std": 0.20409680902957916, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, + "grad_norm": 0.13935035467147827, + "kl": 0.03753662109375, + "learning_rate": 9.855782761373402e-07, + "loss": 0.0977, + "num_tokens": 364190485.0, + "reward": 1.07421875, + "reward_std": 0.2387661635875702, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9609375, - "rewards/tag_count_reward/std": 0.14054512977600098, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.19759400188922882, "step": 507 }, { @@ -14718,27 +14718,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.130859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1937.0, - "completions/mean_length": 967.68359375, - "completions/mean_terminated_length": 939.5390625, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1165.626953125, + "completions/mean_terminated_length": 1032.7752685546875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.17342323120252626, - "grad_norm": 1.2340688705444336, - "kl": 0.247314453125, - "learning_rate": 9.853187446067019e-07, - "loss": 0.0591, - "num_tokens": 337859648.0, - "reward": 1.04248046875, - "reward_std": 0.188123419880867, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, + "grad_norm": 0.21689465641975403, + "kl": 0.04156494140625, + "learning_rate": 9.854432795584938e-07, + "loss": 0.1235, + "num_tokens": 364869174.0, + "reward": 0.9736328125, + "reward_std": 0.27307945489883423, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.97021484375, - "rewards/tag_count_reward/std": 0.11585026234388351, + "rewards/tag_count_reward/mean": 0.8857421875, + "rewards/tag_count_reward/std": 0.2480488121509552, "step": 508 }, { @@ -14747,27 +14747,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.068359375, + "completions/clipped_ratio": 0.10546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 1024.251953125, - "completions/mean_terminated_length": 949.1341552734375, - "completions/min_length": 275.0, - "completions/min_terminated_length": 275.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1150.201171875, + "completions/mean_terminated_length": 1044.34716796875, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.17376461551591704, - "grad_norm": 3.208914279937744, - "kl": 0.501953125, - "learning_rate": 9.85182613383328e-07, - "loss": 0.1623, - "num_tokens": 338471009.0, - "reward": 1.00146484375, - "reward_std": 0.2128455489873886, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, + "grad_norm": 0.3246607482433319, + "kl": 0.0452880859375, + "learning_rate": 9.853076644785505e-07, + "loss": 0.1065, + "num_tokens": 365545021.0, + "reward": 1.00048828125, + "reward_std": 0.25012367963790894, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.17780545353889465, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.20922017097473145, "step": 509 }, { @@ -14776,27 +14776,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.076171875, + "completions/clipped_ratio": 0.166015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1975.0, - "completions/mean_length": 1120.59765625, - "completions/mean_terminated_length": 1044.131103515625, - "completions/min_length": 227.0, - "completions/min_terminated_length": 227.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1275.62890625, + "completions/mean_terminated_length": 1121.878173828125, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, "epoch": 0.17410599982930786, - "grad_norm": 7.5583696365356445, - "kl": 0.53369140625, - "learning_rate": 9.850458644978307e-07, - "loss": 0.0999, - "num_tokens": 339118275.0, - "reward": 1.029296875, - "reward_std": 0.23589910566806793, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, + "grad_norm": 0.1762906163930893, + "kl": 0.034637451171875, + "learning_rate": 9.851714310901365e-07, + "loss": 0.1341, + "num_tokens": 366271663.0, + "reward": 0.9794921875, + "reward_std": 0.28135573863983154, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.17968250811100006, + "rewards/tag_count_reward/mean": 0.8837890625, + "rewards/tag_count_reward/std": 0.24763242900371552, "step": 510 }, { @@ -14805,27 +14805,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, + "completions/clipped_ratio": 0.103515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 973.658203125, - "completions/mean_terminated_length": 902.0354614257812, - "completions/min_length": 256.0, - "completions/min_terminated_length": 256.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1071.146484375, + "completions/mean_terminated_length": 958.3507690429688, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.17444738414269864, - "grad_norm": 5.508956432342529, - "kl": 0.451171875, - "learning_rate": 9.849084981442997e-07, - "loss": 0.1656, - "num_tokens": 339696084.0, - "reward": 1.0224609375, - "reward_std": 0.1929110884666443, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, + "grad_norm": 0.21623636782169342, + "kl": 0.04559326171875, + "learning_rate": 9.850345795867576e-07, + "loss": 0.0738, + "num_tokens": 366899386.0, + "reward": 0.994140625, + "reward_std": 0.21640846133232117, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9462890625, - "rewards/tag_count_reward/std": 0.16712692379951477, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.21164102852344513, "step": 511 }, { @@ -14834,27 +14834,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.06640625, + "completions/clipped_ratio": 0.134765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1956.0, - "completions/mean_length": 979.482421875, - "completions/mean_terminated_length": 903.4790649414062, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1172.599609375, + "completions/mean_terminated_length": 1036.2506103515625, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, "epoch": 0.17478876845608946, - "grad_norm": 3.7685906887054443, - "kl": 0.2861328125, - "learning_rate": 9.847705145177013e-07, - "loss": 0.1415, - "num_tokens": 340273419.0, - "reward": 0.98583984375, - "reward_std": 0.19239750504493713, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, + "grad_norm": 0.7232561111450195, + "kl": 0.04559326171875, + "learning_rate": 9.848971101627965e-07, + "loss": 0.1261, + "num_tokens": 367575597.0, + "reward": 0.994140625, + "reward_std": 0.28845974802970886, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.1699187308549881, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.22444620728492737, "step": 512 }, { @@ -14863,27 +14863,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.111328125, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1935.0, - "completions/mean_length": 1070.310546875, - "completions/mean_terminated_length": 947.830810546875, - "completions/min_length": 221.0, - "completions/min_terminated_length": 221.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1181.056640625, + "completions/mean_terminated_length": 1061.611083984375, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, "epoch": 0.17513015276948024, - "grad_norm": 1.9535354375839233, - "kl": 0.4921875, - "learning_rate": 9.84631913813878e-07, - "loss": 0.1717, - "num_tokens": 340905754.0, - "reward": 0.98828125, - "reward_std": 0.2464141696691513, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, + "grad_norm": 0.1659488081932068, + "kl": 0.04522705078125, + "learning_rate": 9.847590230135142e-07, + "loss": 0.1185, + "num_tokens": 368264634.0, + "reward": 1.02197265625, + "reward_std": 0.2936013340950012, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.21487605571746826, + "rewards/tag_count_reward/mean": 0.90478515625, + "rewards/tag_count_reward/std": 0.21692466735839844, "step": 513 }, { @@ -14892,27 +14892,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.08984375, + "completions/clipped_ratio": 0.09765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1963.0, - "completions/mean_length": 975.24609375, - "completions/mean_terminated_length": 869.3519287109375, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1108.900390625, + "completions/mean_terminated_length": 1007.2662353515625, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, "epoch": 0.17547153708287105, - "grad_norm": 3.652279853820801, - "kl": 0.43310546875, - "learning_rate": 9.844926962295487e-07, - "loss": 0.1758, - "num_tokens": 341484072.0, - "reward": 1.0634765625, - "reward_std": 0.2978646457195282, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, + "grad_norm": 0.35106366872787476, + "kl": 0.0496826171875, + "learning_rate": 9.846203183350486e-07, + "loss": 0.12, + "num_tokens": 368911383.0, + "reward": 1.06640625, + "reward_std": 0.3203471004962921, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.19308137893676758, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.20392431318759918, "step": 514 }, { @@ -14921,27 +14921,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.083984375, + "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1989.0, - "completions/mean_length": 974.666015625, - "completions/mean_terminated_length": 876.2579956054688, - "completions/min_length": 237.0, - "completions/min_terminated_length": 237.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1138.67578125, + "completions/mean_terminated_length": 989.8772583007812, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.17581292139626184, - "grad_norm": 3.9282784461975098, - "kl": 0.5498046875, - "learning_rate": 9.843528619623068e-07, - "loss": 0.1777, - "num_tokens": 342060589.0, - "reward": 1.0673828125, - "reward_std": 0.28128862380981445, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, + "grad_norm": 0.33725979924201965, + "kl": 0.0477294921875, + "learning_rate": 9.844809963244153e-07, + "loss": 0.1325, + "num_tokens": 369571873.0, + "reward": 1.05126953125, + "reward_std": 0.29890111088752747, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9345703125, - "rewards/tag_count_reward/std": 0.1892828494310379, + "rewards/tag_count_reward/mean": 0.88720703125, + "rewards/tag_count_reward/std": 0.2373974323272705, "step": 515 }, { @@ -14950,27 +14950,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.123046875, + "completions/clipped_ratio": 0.12890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 1088.9453125, - "completions/mean_terminated_length": 954.378662109375, - "completions/min_length": 295.0, - "completions/min_terminated_length": 295.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1216.09375, + "completions/mean_terminated_length": 1092.986572265625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, "epoch": 0.17615430570965265, - "grad_norm": 10.424691200256348, - "kl": 0.84375, - "learning_rate": 9.842124112106214e-07, - "loss": 0.2623, - "num_tokens": 342701937.0, - "reward": 1.0419921875, - "reward_std": 0.294842392206192, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, + "grad_norm": 129.20872497558594, + "kl": 1.166259765625, + "learning_rate": 9.84341057179506e-07, + "loss": 0.1696, + "num_tokens": 370278321.0, + "reward": 1.07080078125, + "reward_std": 0.3014245629310608, + "rewards/accuracy_reward/mean": 0.173828125, + "rewards/accuracy_reward/std": 0.3793322443962097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9111328125, - "rewards/tag_count_reward/std": 0.2142503559589386, + "rewards/tag_count_reward/mean": 0.89697265625, + "rewards/tag_count_reward/std": 0.22612106800079346, "step": 516 }, { @@ -14979,27 +14979,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.150390625, + "completions/clipped_ratio": 0.1796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 1101.529296875, - "completions/mean_terminated_length": 933.9931030273438, - "completions/min_length": 230.0, - "completions/min_terminated_length": 230.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1260.37109375, + "completions/mean_terminated_length": 1087.8428955078125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.17649569002304344, - "grad_norm": 197.44442749023438, - "kl": 2.42919921875, - "learning_rate": 9.84071344173837e-07, - "loss": 0.2986, - "num_tokens": 343343216.0, - "reward": 0.97265625, - "reward_std": 0.2862517833709717, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, + "grad_norm": 13.521764755249023, + "kl": 0.1954345703125, + "learning_rate": 9.8420050109909e-07, + "loss": 0.1263, + "num_tokens": 371000927.0, + "reward": 0.9501953125, + "reward_std": 0.27213138341903687, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89453125, - "rewards/tag_count_reward/std": 0.23431077599525452, + "rewards/tag_count_reward/mean": 0.8779296875, + "rewards/tag_count_reward/std": 0.24228043854236603, "step": 517 }, { @@ -15008,27 +15008,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.15234375, + "completions/clipped_ratio": 0.208984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 1096.52734375, - "completions/mean_terminated_length": 925.5253295898438, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1248.27734375, + "completions/mean_terminated_length": 1036.99267578125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.17683707433643425, - "grad_norm": 5.872008323669434, - "kl": 0.8203125, - "learning_rate": 9.839296610521723e-07, - "loss": 0.2026, - "num_tokens": 343989006.0, - "reward": 0.95263671875, - "reward_std": 0.2677186131477356, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, + "grad_norm": 0.49862024188041687, + "kl": 0.05731201171875, + "learning_rate": 9.840593282828121e-07, + "loss": 0.1197, + "num_tokens": 371724413.0, + "reward": 0.94921875, + "reward_std": 0.31343209743499756, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89208984375, - "rewards/tag_count_reward/std": 0.23657894134521484, + "rewards/tag_count_reward/mean": 0.865234375, + "rewards/tag_count_reward/std": 0.2643204629421234, "step": 518 }, { @@ -15037,27 +15037,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.12890625, + "completions/clipped_ratio": 0.162109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1954.0, - "completions/mean_length": 1106.880859375, - "completions/mean_terminated_length": 967.6121826171875, - "completions/min_length": 218.0, - "completions/min_terminated_length": 218.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1254.5859375, + "completions/mean_terminated_length": 1101.0816650390625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.17717845864982504, - "grad_norm": 7.19550895690918, - "kl": 0.6943359375, - "learning_rate": 9.837873620467203e-07, - "loss": 0.1934, - "num_tokens": 344637665.0, - "reward": 0.98095703125, - "reward_std": 0.2884325683116913, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, + "grad_norm": 0.562168538570404, + "kl": 0.048095703125, + "learning_rate": 9.839175389311934e-07, + "loss": 0.1291, + "num_tokens": 372448697.0, + "reward": 0.9697265625, + "reward_std": 0.30398112535476685, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90087890625, - "rewards/tag_count_reward/std": 0.2294684797525406, + "rewards/tag_count_reward/mean": 0.8740234375, + "rewards/tag_count_reward/std": 0.2536408603191376, "step": 519 }, { @@ -15066,27 +15066,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.11328125, + "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 1099.63671875, - "completions/mean_terminated_length": 978.4801635742188, - "completions/min_length": 200.0, - "completions/min_terminated_length": 200.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1157.724609375, + "completions/mean_terminated_length": 1050.579833984375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.17751984296321585, - "grad_norm": 1.9427558183670044, - "kl": 0.462890625, - "learning_rate": 9.836444473594488e-07, - "loss": 0.1731, - "num_tokens": 345280039.0, - "reward": 0.97802734375, - "reward_std": 0.24953964352607727, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, + "grad_norm": 0.4277644455432892, + "kl": 0.0462646484375, + "learning_rate": 9.837751332456306e-07, + "loss": 0.1201, + "num_tokens": 373120812.0, + "reward": 0.9931640625, + "reward_std": 0.22299577295780182, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91162109375, - "rewards/tag_count_reward/std": 0.21244709193706512, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.18938377499580383, "step": 520 }, { @@ -15095,27 +15095,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0859375, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 935.283203125, - "completions/mean_terminated_length": 830.6688232421875, - "completions/min_length": 201.0, - "completions/min_terminated_length": 201.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1010.0625, + "completions/mean_terminated_length": 907.6051635742188, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.17786122727660664, - "grad_norm": 4.585904121398926, - "kl": 0.41796875, - "learning_rate": 9.83500917193199e-07, - "loss": 0.189, - "num_tokens": 345842104.0, - "reward": 1.01220703125, - "reward_std": 0.2607251703739166, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, + "grad_norm": 0.30204182863235474, + "kl": 0.05279541015625, + "learning_rate": 9.83632111428396e-07, + "loss": 0.0776, + "num_tokens": 373721164.0, + "reward": 1.0078125, + "reward_std": 0.2607133090496063, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.18939071893692017, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.1767766922712326, "step": 521 }, { @@ -15124,27 +15124,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.138671875, + "completions/clipped_ratio": 0.1015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1953.0, - "completions/mean_length": 1115.462890625, - "completions/mean_terminated_length": 965.3265380859375, - "completions/min_length": 204.0, - "completions/min_terminated_length": 204.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 1175.888671875, + "completions/mean_terminated_length": 1077.3021240234375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.17820261158999745, - "grad_norm": 3.002474546432495, - "kl": 0.43408203125, - "learning_rate": 9.833567717516856e-07, - "loss": 0.1894, - "num_tokens": 346490661.0, - "reward": 0.95751953125, - "reward_std": 0.25489163398742676, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, + "grad_norm": 0.3688586354255676, + "kl": 0.04742431640625, + "learning_rate": 9.834884736826366e-07, + "loss": 0.1072, + "num_tokens": 374400659.0, + "reward": 0.99072265625, + "reward_std": 0.19481873512268066, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89697265625, - "rewards/tag_count_reward/std": 0.2309378832578659, + "rewards/tag_count_reward/mean": 0.92041015625, + "rewards/tag_count_reward/std": 0.19380934536457062, "step": 522 }, { @@ -15153,27 +15153,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.09765625, + "completions/clipped_ratio": 0.11328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 1051.08984375, - "completions/mean_terminated_length": 943.1991577148438, - "completions/min_length": 229.0, - "completions/min_terminated_length": 229.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1126.701171875, + "completions/mean_terminated_length": 1009.0021362304688, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, "epoch": 0.17854399590338824, - "grad_norm": 1.89009428024292, - "kl": 0.46484375, - "learning_rate": 9.832120112394969e-07, - "loss": 0.1646, - "num_tokens": 347102707.0, - "reward": 0.97509765625, - "reward_std": 0.18446403741836548, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, + "grad_norm": 0.31846165657043457, + "kl": 0.04681396484375, + "learning_rate": 9.833442202123754e-07, + "loss": 0.1067, + "num_tokens": 375051418.0, + "reward": 0.98486328125, + "reward_std": 0.18726766109466553, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.18486134707927704, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.1818806678056717, "step": 523 }, { @@ -15182,27 +15182,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.12890625, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1040.197265625, - "completions/mean_terminated_length": 891.0606079101562, - "completions/min_length": 198.0, - "completions/min_terminated_length": 198.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1098.70703125, + "completions/mean_terminated_length": 967.9155883789062, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.17888538021677905, - "grad_norm": 10.38375473022461, - "kl": 0.623779296875, - "learning_rate": 9.830666358620936e-07, - "loss": 0.1913, - "num_tokens": 347709880.0, - "reward": 1.064453125, - "reward_std": 0.3105708956718445, - "rewards/accuracy_reward/mean": 0.154296875, - "rewards/accuracy_reward/std": 0.36158639192581177, + "grad_norm": 0.20927104353904724, + "kl": 0.04705810546875, + "learning_rate": 9.831993512225084e-07, + "loss": 0.0805, + "num_tokens": 375688548.0, + "reward": 1.07080078125, + "reward_std": 0.2962263524532318, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91015625, - "rewards/tag_count_reward/std": 0.21836963295936584, + "rewards/tag_count_reward/mean": 0.91064453125, + "rewards/tag_count_reward/std": 0.21772928535938263, "step": 524 }, { @@ -15211,27 +15211,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.09375, + "completions/clipped_ratio": 0.09765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1997.0, - "completions/mean_length": 1002.6796875, - "completions/mean_terminated_length": 894.5430908203125, - "completions/min_length": 248.0, - "completions/min_terminated_length": 248.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1083.947265625, + "completions/mean_terminated_length": 979.612548828125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.17922676453016984, - "grad_norm": 3.3788344860076904, - "kl": 0.5263671875, - "learning_rate": 9.829206458258097e-07, - "loss": 0.1746, - "num_tokens": 348304372.0, - "reward": 1.041015625, - "reward_std": 0.23436088860034943, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, + "grad_norm": 0.3787153661251068, + "kl": 0.06329345703125, + "learning_rate": 9.830538669188068e-07, + "loss": 0.1176, + "num_tokens": 376324649.0, + "reward": 1.0380859375, + "reward_std": 0.2227935642004013, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.19635210931301117, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.18003050982952118, "step": 525 }, { @@ -15240,27 +15240,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.107421875, + "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 1075.69921875, - "completions/mean_terminated_length": 958.6827392578125, - "completions/min_length": 220.0, - "completions/min_terminated_length": 220.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1142.720703125, + "completions/mean_terminated_length": 1042.570556640625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.17956814884356065, - "grad_norm": 19.488237380981445, - "kl": 0.97705078125, - "learning_rate": 9.827740413378513e-07, - "loss": 0.1829, - "num_tokens": 348933850.0, - "reward": 1.02294921875, - "reward_std": 0.2862394452095032, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, + "grad_norm": 6.252066135406494, + "kl": 0.1630859375, + "learning_rate": 9.829077675079162e-07, + "loss": 0.1408, + "num_tokens": 376988442.0, + "reward": 1.009765625, + "reward_std": 0.279976487159729, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.21517324447631836, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.19534705579280853, "step": 526 }, { @@ -15269,27 +15269,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.103515625, + "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 1089.953125, - "completions/mean_terminated_length": 979.3289794921875, - "completions/min_length": 312.0, - "completions/min_terminated_length": 312.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1216.142578125, + "completions/mean_terminated_length": 1105.718994140625, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, "epoch": 0.17990953315695143, - "grad_norm": 2.0587828159332275, - "kl": 0.36572265625, - "learning_rate": 9.826268226062967e-07, - "loss": 0.1547, - "num_tokens": 349574642.0, - "reward": 1.0166015625, - "reward_std": 0.26578330993652344, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, + "grad_norm": 536.9520874023438, + "kl": 4.33642578125, + "learning_rate": 9.827610531973547e-07, + "loss": 0.3018, + "num_tokens": 377693843.0, + "reward": 1.005859375, + "reward_std": 0.2589573860168457, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.19785255193710327, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.1995285600423813, "step": 527 }, { @@ -15298,27 +15298,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 962.0703125, - "completions/mean_terminated_length": 899.2479248046875, - "completions/min_length": 255.0, - "completions/min_terminated_length": 255.0, - "epoch": 0.18025091747034225, - "grad_norm": 18.152191162109375, - "kl": 0.585205078125, - "learning_rate": 9.82478989840096e-07, - "loss": 0.1541, - "num_tokens": 350138598.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1084.8671875, + "completions/mean_terminated_length": 1005.4545288085938, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.18025091747034225, + "grad_norm": 17.69856834411621, + "kl": 0.5198974609375, + "learning_rate": 9.826137241955148e-07, + "loss": 0.1135, + "num_tokens": 378320671.0, "reward": 1.10009765625, - "reward_std": 0.1812194138765335, - "rewards/accuracy_reward/mean": 0.140625, - "rewards/accuracy_reward/std": 0.3479743003845215, + "reward_std": 0.23680847883224487, + "rewards/accuracy_reward/mean": 0.162109375, + "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.95947265625, - "rewards/tag_count_reward/std": 0.15145450830459595, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.17257477343082428, "step": 528 }, { @@ -15327,27 +15327,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.08984375, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1021.46875, - "completions/mean_terminated_length": 920.1373291015625, - "completions/min_length": 195.0, - "completions/min_terminated_length": 195.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1059.419921875, + "completions/mean_terminated_length": 973.365234375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, "epoch": 0.18059230178373303, - "grad_norm": 5.078516483306885, - "kl": 0.38037109375, - "learning_rate": 9.823305432490705e-07, - "loss": 0.1617, - "num_tokens": 350734054.0, - "reward": 1.02099609375, - "reward_std": 0.24846667051315308, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, + "grad_norm": 1183.978271484375, + "kl": 12.9266357421875, + "learning_rate": 9.824657807116617e-07, + "loss": 0.6084, + "num_tokens": 378935558.0, + "reward": 1.04150390625, + "reward_std": 0.23618459701538086, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.19895724952220917, + "rewards/tag_count_reward/mean": 0.93408203125, + "rewards/tag_count_reward/std": 0.18878936767578125, "step": 529 }, { @@ -15356,27 +15356,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.134765625, + "completions/clipped_ratio": 0.15234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1131.94140625, - "completions/mean_terminated_length": 989.2596435546875, - "completions/min_length": 283.0, - "completions/min_terminated_length": 283.0, + "completions/mean_length": 1265.384765625, + "completions/mean_terminated_length": 1124.73046875, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, "epoch": 0.18093368609712385, - "grad_norm": 5.4803643226623535, - "kl": 0.34765625, - "learning_rate": 9.821814830439133e-07, - "loss": 0.1819, - "num_tokens": 351392728.0, - "reward": 0.98876953125, - "reward_std": 0.28743964433670044, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, + "grad_norm": 0.5579692721366882, + "kl": 0.05126953125, + "learning_rate": 9.823172229559335e-07, + "loss": 0.1222, + "num_tokens": 379662555.0, + "reward": 1.02197265625, + "reward_std": 0.29389214515686035, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89892578125, - "rewards/tag_count_reward/std": 0.2390739917755127, + "rewards/tag_count_reward/mean": 0.89697265625, + "rewards/tag_count_reward/std": 0.22340019047260284, "step": 530 }, { @@ -15385,27 +15385,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.076171875, + "completions/clipped_ratio": 0.103515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 952.357421875, - "completions/mean_terminated_length": 862.01904296875, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1084.130859375, + "completions/mean_terminated_length": 972.8344116210938, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.18127507041051463, - "grad_norm": 6.403390407562256, - "kl": 0.372802734375, - "learning_rate": 9.820318094361883e-07, - "loss": 0.1543, - "num_tokens": 351960927.0, + "grad_norm": 0.4010159969329834, + "kl": 0.04547119140625, + "learning_rate": 9.821680511393407e-07, + "loss": 0.0922, + "num_tokens": 380298222.0, "reward": 1.01171875, - "reward_std": 0.2463240623474121, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, + "reward_std": 0.2094048112630844, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.19829878211021423, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.18033012747764587, "step": 531 }, { @@ -15414,27 +15414,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.09375, + "completions/clipped_ratio": 0.162109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 1078.537109375, - "completions/mean_terminated_length": 978.2478637695312, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1217.08203125, + "completions/mean_terminated_length": 1056.3216552734375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, "epoch": 0.18161645472390545, - "grad_norm": 6.348297595977783, - "kl": 0.53515625, - "learning_rate": 9.81881522638329e-07, - "loss": 0.2102, - "num_tokens": 352597138.0, - "reward": 1.02978515625, - "reward_std": 0.2722817659378052, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, + "grad_norm": 0.3693610429763794, + "kl": 0.04864501953125, + "learning_rate": 9.82018265473766e-07, + "loss": 0.1253, + "num_tokens": 381005368.0, + "reward": 1.02197265625, + "reward_std": 0.29467299580574036, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.21247407793998718, + "rewards/tag_count_reward/mean": 0.88525390625, + "rewards/tag_count_reward/std": 0.24359171092510223, "step": 532 }, { @@ -15443,27 +15443,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.16015625, + "completions/clipped_ratio": 0.1484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1977.0, - "completions/mean_length": 1175.853515625, - "completions/mean_terminated_length": 1009.5372314453125, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1280.248046875, + "completions/mean_terminated_length": 1146.419677734375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.18195783903729623, - "grad_norm": 3.549447536468506, - "kl": 0.7392578125, - "learning_rate": 9.817306228636411e-07, - "loss": 0.2242, - "num_tokens": 353274087.0, - "reward": 0.94091796875, - "reward_std": 0.31294119358062744, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, + "grad_norm": 0.48421764373779297, + "kl": 0.04736328125, + "learning_rate": 9.818678661719642e-07, + "loss": 0.1214, + "num_tokens": 381735767.0, + "reward": 0.99462890625, + "reward_std": 0.30959808826446533, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87255859375, - "rewards/tag_count_reward/std": 0.2565080523490906, + "rewards/tag_count_reward/mean": 0.90087890625, + "rewards/tag_count_reward/std": 0.21854843199253082, "step": 533 }, { @@ -15472,27 +15472,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.123046875, + "completions/clipped_ratio": 0.12890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1033.978515625, - "completions/mean_terminated_length": 891.6993408203125, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1142.37109375, + "completions/mean_terminated_length": 1008.3543090820312, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.18229922335068705, - "grad_norm": 69.35165405273438, - "kl": 1.7158203125, - "learning_rate": 9.815791103262981e-07, - "loss": 0.2082, - "num_tokens": 353872972.0, - "reward": 1.017578125, - "reward_std": 0.3158687949180603, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, + "grad_norm": 0.2830883264541626, + "kl": 0.066650390625, + "learning_rate": 9.817168534475617e-07, + "loss": 0.111, + "num_tokens": 382390149.0, + "reward": 1.03271484375, + "reward_std": 0.2560274004936218, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.908203125, - "rewards/tag_count_reward/std": 0.22852177917957306, + "rewards/tag_count_reward/mean": 0.91357421875, + "rewards/tag_count_reward/std": 0.21779070794582367, "step": 534 }, { @@ -15501,27 +15501,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.134765625, + "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 1086.748046875, - "completions/mean_terminated_length": 937.027099609375, - "completions/min_length": 314.0, - "completions/min_terminated_length": 314.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1169.939453125, + "completions/mean_terminated_length": 1053.3826904296875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, "epoch": 0.18264060766407783, - "grad_norm": 26.75957679748535, - "kl": 1.73095703125, - "learning_rate": 9.814269852413453e-07, - "loss": 0.2815, - "num_tokens": 354504011.0, - "reward": 1.060546875, - "reward_std": 0.3185046911239624, - "rewards/accuracy_reward/mean": 0.1713709682226181, - "rewards/accuracy_reward/std": 0.3772132694721222, + "grad_norm": 0.4295150339603424, + "kl": 0.06268310546875, + "learning_rate": 9.81565227515056e-07, + "loss": 0.0992, + "num_tokens": 383063782.0, + "reward": 1.10595703125, + "reward_std": 0.25862962007522583, + "rewards/accuracy_reward/mean": 0.20967741310596466, + "rewards/accuracy_reward/std": 0.4074893593788147, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89453125, - "rewards/tag_count_reward/std": 0.24049316346645355, + "rewards/tag_count_reward/mean": 0.90283203125, + "rewards/tag_count_reward/std": 0.2166205197572708, "step": 535 }, { @@ -15530,27 +15530,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1171875, + "completions/clipped_ratio": 0.13671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 1036.91796875, - "completions/mean_terminated_length": 902.7035522460938, - "completions/min_length": 215.0, - "completions/min_terminated_length": 215.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1146.947265625, + "completions/mean_terminated_length": 1004.2466430664062, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.18298199197746864, - "grad_norm": 10.085936546325684, - "kl": 0.70849609375, - "learning_rate": 9.812742478246957e-07, - "loss": 0.2268, - "num_tokens": 355112705.0, + "grad_norm": 2.3532238006591797, + "kl": 0.118408203125, + "learning_rate": 9.814129885898154e-07, + "loss": 0.1271, + "num_tokens": 383728811.0, "reward": 0.99169921875, - "reward_std": 0.2692239582538605, - "rewards/accuracy_reward/mean": 0.08467742055654526, - "rewards/accuracy_reward/std": 0.278682142496109, + "reward_std": 0.21569226682186127, + "rewards/accuracy_reward/mean": 0.0786290317773819, + "rewards/accuracy_reward/std": 0.26943066716194153, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.2272297441959381, + "rewards/tag_count_reward/mean": 0.91552734375, + "rewards/tag_count_reward/std": 0.1967647224664688, "step": 536 }, { @@ -15559,27 +15559,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.177734375, + "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1936.0, - "completions/mean_length": 1110.74609375, - "completions/mean_terminated_length": 908.1567993164062, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1164.4140625, + "completions/mean_terminated_length": 1019.8272705078125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, "epoch": 0.18332337629085943, - "grad_norm": 110.41934204101562, - "kl": 2.91796875, - "learning_rate": 9.811208982931327e-07, - "loss": 0.373, - "num_tokens": 355753231.0, - "reward": 0.94580078125, - "reward_std": 0.2905876040458679, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, + "grad_norm": 0.5218260288238525, + "kl": 0.0560302734375, + "learning_rate": 9.812601368880796e-07, + "loss": 0.076, + "num_tokens": 384396815.0, + "reward": 0.98486328125, + "reward_std": 0.2580307126045227, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87939453125, - "rewards/tag_count_reward/std": 0.2550473213195801, + "rewards/tag_count_reward/mean": 0.90478515625, + "rewards/tag_count_reward/std": 0.21692466735839844, "step": 537 }, { @@ -15588,27 +15588,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.130859375, + "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1964.0, - "completions/mean_length": 1066.75, - "completions/mean_terminated_length": 919.01123046875, - "completions/min_length": 264.0, - "completions/min_terminated_length": 264.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1130.181640625, + "completions/mean_terminated_length": 1017.4671020507812, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, "epoch": 0.18366476060425024, - "grad_norm": 70.96749114990234, - "kl": 4.224609375, - "learning_rate": 9.809669368643075e-07, - "loss": 0.3807, - "num_tokens": 356369455.0, - "reward": 1.0087890625, - "reward_std": 0.3199254274368286, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, + "grad_norm": 20.854175567626953, + "kl": 0.41790771484375, + "learning_rate": 9.811066726269582e-07, + "loss": 0.1362, + "num_tokens": 385045516.0, + "reward": 1.06201171875, + "reward_std": 0.24447444081306458, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8994140625, - "rewards/tag_count_reward/std": 0.23953568935394287, + "rewards/tag_count_reward/mean": 0.92529296875, + "rewards/tag_count_reward/std": 0.19449345767498016, "step": 538 }, { @@ -15617,27 +15617,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.2109375, + "completions/clipped_ratio": 0.166015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1930.0, - "completions/mean_length": 1183.3671875, - "completions/mean_terminated_length": 952.2277221679688, - "completions/min_length": 212.0, - "completions/min_terminated_length": 212.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1200.7578125, + "completions/mean_terminated_length": 1032.10302734375, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, "epoch": 0.18400614491764103, - "grad_norm": 609.8837890625, - "kl": 26.34375, - "learning_rate": 9.808123637567406e-07, - "loss": 1.2569, - "num_tokens": 357051515.0, - "reward": 0.9091796875, - "reward_std": 0.3217463791370392, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, + "grad_norm": 0.5710415244102478, + "kl": 0.07281494140625, + "learning_rate": 9.809525960244308e-07, + "loss": 0.119, + "num_tokens": 385736480.0, + "reward": 1.00439453125, + "reward_std": 0.28465235233306885, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8447265625, - "rewards/tag_count_reward/std": 0.28594186902046204, + "rewards/tag_count_reward/mean": 0.89111328125, + "rewards/tag_count_reward/std": 0.23561164736747742, "step": 539 }, { @@ -15646,27 +15646,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.181640625, + "completions/clipped_ratio": 0.111328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 1120.263671875, - "completions/mean_terminated_length": 914.3460693359375, - "completions/min_length": 246.0, - "completions/min_terminated_length": 246.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1132.79296875, + "completions/mean_terminated_length": 1018.1406860351562, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.18434752923103184, - "grad_norm": 276.126220703125, - "kl": 10.609375, - "learning_rate": 9.806571791898196e-07, - "loss": 0.6705, - "num_tokens": 357698066.0, - "reward": 0.92431640625, - "reward_std": 0.2812725007534027, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, + "grad_norm": 3.235412836074829, + "kl": 0.10955810546875, + "learning_rate": 9.807979072993469e-07, + "loss": 0.0988, + "num_tokens": 386389446.0, + "reward": 1.001953125, + "reward_std": 0.21348832547664642, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.86962890625, - "rewards/tag_count_reward/std": 0.2690318524837494, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.17862646281719208, "step": 540 }, { @@ -15675,27 +15675,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.19921875, + "completions/clipped_ratio": 0.146484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 1167.890625, - "completions/mean_terminated_length": 948.9365844726562, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1231.66796875, + "completions/mean_terminated_length": 1091.565185546875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.18468891354442263, - "grad_norm": 28.768630981445312, - "kl": 2.984375, - "learning_rate": 9.805013833838014e-07, - "loss": 0.3308, - "num_tokens": 358376490.0, - "reward": 0.96142578125, - "reward_std": 0.2962723672389984, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, + "grad_norm": 0.3809445798397064, + "kl": 0.07183837890625, + "learning_rate": 9.806426066714256e-07, + "loss": 0.1247, + "num_tokens": 387100524.0, + "reward": 1.01123046875, + "reward_std": 0.2569984197616577, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.86767578125, - "rewards/tag_count_reward/std": 0.26207634806632996, + "rewards/tag_count_reward/mean": 0.90185546875, + "rewards/tag_count_reward/std": 0.21561242640018463, "step": 541 }, { @@ -15704,27 +15704,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, + "completions/clipped_ratio": 0.12890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1966.0, - "completions/mean_length": 986.94921875, - "completions/mean_terminated_length": 835.3705444335938, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1074.423828125, + "completions/mean_terminated_length": 930.35205078125, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, "epoch": 0.18503029785781344, - "grad_norm": 17.48018455505371, - "kl": 1.12109375, - "learning_rate": 9.80344976559809e-07, - "loss": 0.1957, - "num_tokens": 358964896.0, - "reward": 1.05078125, - "reward_std": 0.2224898636341095, - "rewards/accuracy_reward/mean": 0.13671875, - "rewards/accuracy_reward/std": 0.3438861668109894, + "grad_norm": 8.068211555480957, + "kl": 0.2899169921875, + "learning_rate": 9.804866943612547e-07, + "loss": 0.1111, + "num_tokens": 387733717.0, + "reward": 1.0380859375, + "reward_std": 0.22613106667995453, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.21770349144935608, + "rewards/tag_count_reward/mean": 0.9033203125, + "rewards/tag_count_reward/std": 0.2204751819372177, "step": 542 }, { @@ -15733,27 +15733,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.158203125, + "completions/clipped_ratio": 0.130859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 1151.96875, - "completions/mean_terminated_length": 983.5730590820312, - "completions/min_length": 75.0, - "completions/min_terminated_length": 75.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1209.01953125, + "completions/mean_terminated_length": 1082.701171875, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, "epoch": 0.18537168217120423, - "grad_norm": 12.866509437561035, - "kl": 1.9130859375, - "learning_rate": 9.801879589398338e-07, - "loss": 0.281, - "num_tokens": 359636288.0, - "reward": 0.99462890625, - "reward_std": 0.33961647748947144, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, + "grad_norm": 1.091995358467102, + "kl": 0.073486328125, + "learning_rate": 9.803301705902917e-07, + "loss": 0.1413, + "num_tokens": 388434319.0, + "reward": 1.02978515625, + "reward_std": 0.2876089811325073, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88134765625, - "rewards/tag_count_reward/std": 0.25162631273269653, + "rewards/tag_count_reward/mean": 0.91064453125, + "rewards/tag_count_reward/std": 0.2085477113723755, "step": 543 }, { @@ -15762,27 +15762,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1328125, + "completions/clipped_ratio": 0.123046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 1039.720703125, - "completions/mean_terminated_length": 885.299560546875, - "completions/min_length": 243.0, - "completions/min_terminated_length": 243.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1146.916015625, + "completions/mean_terminated_length": 1020.4833374023438, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, "epoch": 0.18571306648459504, - "grad_norm": 27.830080032348633, - "kl": 2.115234375, - "learning_rate": 9.80030330746733e-07, - "loss": 0.2714, - "num_tokens": 360241009.0, - "reward": 1.12841796875, - "reward_std": 0.3509364724159241, - "rewards/accuracy_reward/mean": 0.2265625, - "rewards/accuracy_reward/std": 0.4190165400505066, + "grad_norm": 0.3270891606807709, + "kl": 0.0604248046875, + "learning_rate": 9.801730355808616e-07, + "loss": 0.1041, + "num_tokens": 389093924.0, + "reward": 1.146484375, + "reward_std": 0.3347158432006836, + "rewards/accuracy_reward/mean": 0.23828125, + "rewards/accuracy_reward/std": 0.42644867300987244, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90185546875, - "rewards/tag_count_reward/std": 0.23722027242183685, + "rewards/tag_count_reward/mean": 0.908203125, + "rewards/tag_count_reward/std": 0.21699129045009613, "step": 544 }, { @@ -15791,27 +15791,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.169921875, + "completions/clipped_ratio": 0.11328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 1075.21484375, - "completions/mean_terminated_length": 876.0799560546875, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1102.109375, + "completions/mean_terminated_length": 981.2686767578125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.18605445079798583, - "grad_norm": 20.0777645111084, - "kl": 1.845703125, - "learning_rate": 9.798720922042316e-07, - "loss": 0.2914, - "num_tokens": 360873951.0, - "reward": 1.0576171875, - "reward_std": 0.3573155999183655, - "rewards/accuracy_reward/mean": 0.1796875, - "rewards/accuracy_reward/std": 0.38430243730545044, + "grad_norm": 0.3841545283794403, + "kl": 0.0579833984375, + "learning_rate": 9.80015289556158e-07, + "loss": 0.0828, + "num_tokens": 389740636.0, + "reward": 1.15673828125, + "reward_std": 0.3368246555328369, + "rewards/accuracy_reward/mean": 0.228515625, + "rewards/accuracy_reward/std": 0.4202871024608612, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8779296875, - "rewards/tag_count_reward/std": 0.2507156431674957, + "rewards/tag_count_reward/mean": 0.92822265625, + "rewards/tag_count_reward/std": 0.18131764233112335, "step": 545 }, { @@ -15820,27 +15820,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.138671875, + "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 1059.560546875, - "completions/mean_terminated_length": 900.424072265625, - "completions/min_length": 201.0, - "completions/min_terminated_length": 201.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1073.97265625, + "completions/mean_terminated_length": 954.3552856445312, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.18639583511137664, - "grad_norm": 6.073132514953613, - "kl": 1.2763671875, - "learning_rate": 9.7971324353692e-07, - "loss": 0.225, - "num_tokens": 361494558.0, - "reward": 1.1376953125, - "reward_std": 0.3568900227546692, - "rewards/accuracy_reward/mean": 0.2421875, - "rewards/accuracy_reward/std": 0.42882615327835083, + "grad_norm": 0.6618931293487549, + "kl": 0.065673828125, + "learning_rate": 9.798569327402428e-07, + "loss": 0.0987, + "num_tokens": 390368622.0, + "reward": 1.1953125, + "reward_std": 0.3582373261451721, + "rewards/accuracy_reward/mean": 0.271484375, + "rewards/accuracy_reward/std": 0.44516023993492126, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8955078125, - "rewards/tag_count_reward/std": 0.2331804633140564, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.1948670893907547, "step": 546 }, { @@ -15849,27 +15849,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.171875, + "completions/clipped_ratio": 0.19140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1898.0, - "completions/mean_length": 1169.859375, - "completions/mean_terminated_length": 987.603759765625, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1294.09375, + "completions/mean_terminated_length": 1115.6328125, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, "epoch": 0.18673721942476743, - "grad_norm": 16.36454963684082, - "kl": 1.921875, - "learning_rate": 9.795537849702546e-07, - "loss": 0.3132, - "num_tokens": 362170822.0, - "reward": 0.95947265625, - "reward_std": 0.3096523880958557, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, + "grad_norm": 0.7665673494338989, + "kl": 0.069091796875, + "learning_rate": 9.79697965358045e-07, + "loss": 0.136, + "num_tokens": 391108494.0, + "reward": 0.9814453125, + "reward_std": 0.29430800676345825, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87353515625, - "rewards/tag_count_reward/std": 0.26310616731643677, + "rewards/tag_count_reward/mean": 0.8720703125, + "rewards/tag_count_reward/std": 0.2555474638938904, "step": 547 }, { @@ -15878,27 +15878,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.123046875, + "completions/clipped_ratio": 0.134765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 1086.716796875, - "completions/mean_terminated_length": 951.8374633789062, - "completions/min_length": 249.0, - "completions/min_terminated_length": 249.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1177.345703125, + "completions/mean_terminated_length": 1041.7359619140625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.18707860373815824, - "grad_norm": 6.874122619628906, - "kl": 0.923828125, - "learning_rate": 9.79393716730558e-07, - "loss": 0.1929, - "num_tokens": 362812645.0, - "reward": 1.04736328125, - "reward_std": 0.2651546001434326, - "rewards/accuracy_reward/mean": 0.142578125, - "rewards/accuracy_reward/std": 0.3499840497970581, + "grad_norm": 0.4188109040260315, + "kl": 0.060302734375, + "learning_rate": 9.795383876353606e-07, + "loss": 0.0912, + "num_tokens": 391796719.0, + "reward": 1.0732421875, + "reward_std": 0.28261393308639526, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90478515625, - "rewards/tag_count_reward/std": 0.21860963106155396, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.2087017148733139, "step": 548 }, { @@ -15907,27 +15907,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.16015625, + "completions/clipped_ratio": 0.13671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1167.39453125, - "completions/mean_terminated_length": 999.465087890625, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1203.34765625, + "completions/mean_terminated_length": 1069.5792236328125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, "epoch": 0.18741998805154902, - "grad_norm": 9.17322826385498, - "kl": 1.46875, - "learning_rate": 9.792330390450179e-07, - "loss": 0.2445, - "num_tokens": 363479071.0, - "reward": 0.95166015625, - "reward_std": 0.318506121635437, - "rewards/accuracy_reward/mean": 0.07459677755832672, - "rewards/accuracy_reward/std": 0.263004869222641, + "grad_norm": 5.095983028411865, + "kl": 0.1500244140625, + "learning_rate": 9.793781997988532e-07, + "loss": 0.1039, + "num_tokens": 392481553.0, + "reward": 1.02392578125, + "reward_std": 0.2649180591106415, + "rewards/accuracy_reward/mean": 0.10685484111309052, + "rewards/accuracy_reward/std": 0.3092404901981354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87939453125, - "rewards/tag_count_reward/std": 0.24873501062393188, + "rewards/tag_count_reward/mean": 0.92041015625, + "rewards/tag_count_reward/std": 0.19569343328475952, "step": 549 }, { @@ -15936,27 +15936,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.17578125, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 1178.5703125, - "completions/mean_terminated_length": 993.14697265625, - "completions/min_length": 201.0, - "completions/min_terminated_length": 201.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1180.8046875, + "completions/mean_terminated_length": 1061.324462890625, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, "epoch": 0.18776137236493984, - "grad_norm": 8.53243350982666, - "kl": 1.380859375, - "learning_rate": 9.790717521416865e-07, - "loss": 0.2552, - "num_tokens": 364156995.0, - "reward": 0.99951171875, - "reward_std": 0.3222518563270569, - "rewards/accuracy_reward/mean": 0.12890625, - "rewards/accuracy_reward/std": 0.33542385697364807, + "grad_norm": 0.5860349535942078, + "kl": 0.07501220703125, + "learning_rate": 9.792174020760524e-07, + "loss": 0.0843, + "num_tokens": 393160621.0, + "reward": 1.099609375, + "reward_std": 0.2811385691165924, + "rewards/accuracy_reward/mean": 0.177734375, + "rewards/accuracy_reward/std": 0.3826628625392914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87060546875, - "rewards/tag_count_reward/std": 0.2612071633338928, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.19091396033763885, "step": 550 }, { @@ -15965,27 +15965,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1897.0, - "completions/mean_length": 1103.326171875, - "completions/mean_terminated_length": 968.372802734375, - "completions/min_length": 255.0, - "completions/min_terminated_length": 255.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1117.69921875, + "completions/mean_terminated_length": 1043.1180419921875, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.18810275667833062, - "grad_norm": 10.166664123535156, - "kl": 1.447265625, - "learning_rate": 9.789098562494813e-07, - "loss": 0.2518, - "num_tokens": 364802682.0, - "reward": 0.96435546875, - "reward_std": 0.30384162068367004, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, + "grad_norm": 0.5190368890762329, + "kl": 0.076904296875, + "learning_rate": 9.790559946953549e-07, + "loss": 0.0743, + "num_tokens": 393813667.0, + "reward": 1.04248046875, + "reward_std": 0.26518183946609497, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89794921875, - "rewards/tag_count_reward/std": 0.23504318296909332, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.169349804520607, "step": 551 }, { @@ -15994,27 +15994,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.162109375, + "completions/clipped_ratio": 0.134765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 1155.5546875, - "completions/mean_terminated_length": 982.8904418945312, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1194.0, + "completions/mean_terminated_length": 1060.9842529296875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.18844414099172144, - "grad_norm": 12.2529296875, - "kl": 1.177734375, - "learning_rate": 9.787473515981837e-07, - "loss": 0.2535, - "num_tokens": 365464006.0, - "reward": 0.9619140625, - "reward_std": 0.3028673827648163, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, + "grad_norm": 0.5157608985900879, + "kl": 0.099365234375, + "learning_rate": 9.788939778860224e-07, + "loss": 0.1115, + "num_tokens": 394494675.0, + "reward": 1.0224609375, + "reward_std": 0.2601080536842346, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8759765625, - "rewards/tag_count_reward/std": 0.25121819972991943, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.1972527801990509, "step": 552 }, { @@ -16023,27 +16023,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.14453125, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 1006.98046875, - "completions/mean_terminated_length": 831.1004028320312, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 954.28125, + "completions/mean_terminated_length": 891.0082397460938, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.18878552530511222, - "grad_norm": 8.00321102142334, - "kl": 1.1435546875, - "learning_rate": 9.785842384184396e-07, - "loss": 0.2241, - "num_tokens": 366055676.0, - "reward": 1.04931640625, - "reward_std": 0.3497886657714844, - "rewards/accuracy_reward/mean": 0.150390625, - "rewards/accuracy_reward/std": 0.35780346393585205, + "grad_norm": 0.5783326029777527, + "kl": 0.07244873046875, + "learning_rate": 9.787313518781823e-07, + "loss": 0.0764, + "num_tokens": 395059363.0, + "reward": 1.1435546875, + "reward_std": 0.28659093379974365, + "rewards/accuracy_reward/mean": 0.189453125, + "rewards/accuracy_reward/std": 0.3922513723373413, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89892578125, - "rewards/tag_count_reward/std": 0.23337861895561218, + "rewards/tag_count_reward/mean": 0.9541015625, + "rewards/tag_count_reward/std": 0.14535459876060486, "step": 553 }, { @@ -16052,27 +16052,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.19140625, + "completions/clipped_ratio": 0.091796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 1160.857421875, - "completions/mean_terminated_length": 950.8574829101562, - "completions/min_length": 167.0, - "completions/min_terminated_length": 167.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1110.6875, + "completions/mean_terminated_length": 1015.9484252929688, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.18912690961850304, - "grad_norm": 104.50601196289062, - "kl": 3.88671875, - "learning_rate": 9.784205169417582e-07, - "loss": 0.3926, - "num_tokens": 366729459.0, - "reward": 0.94482421875, - "reward_std": 0.3352063000202179, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, + "grad_norm": 0.4482949376106262, + "kl": 0.07421875, + "learning_rate": 9.785681169028283e-07, + "loss": 0.0812, + "num_tokens": 395707459.0, + "reward": 1.06103515625, + "reward_std": 0.2188917100429535, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.86083984375, - "rewards/tag_count_reward/std": 0.2700740694999695, + "rewards/tag_count_reward/mean": 0.93994140625, + "rewards/tag_count_reward/std": 0.16157673299312592, "step": 554 }, { @@ -16081,27 +16081,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.142578125, + "completions/clipped_ratio": 0.111328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1989.0, - "completions/mean_length": 1085.642578125, - "completions/mean_terminated_length": 925.6150512695312, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1096.193359375, + "completions/mean_terminated_length": 976.9561157226562, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, "epoch": 0.18946829393189382, - "grad_norm": 27.38459587097168, - "kl": 2.0146484375, - "learning_rate": 9.782561874005121e-07, - "loss": 0.2778, - "num_tokens": 367361580.0, - "reward": 0.9658203125, - "reward_std": 0.29609373211860657, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, + "grad_norm": 0.787347137928009, + "kl": 0.098876953125, + "learning_rate": 9.784042731918182e-07, + "loss": 0.1182, + "num_tokens": 396344982.0, + "reward": 1.05615234375, + "reward_std": 0.27714595198631287, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8876953125, - "rewards/tag_count_reward/std": 0.2373717874288559, + "rewards/tag_count_reward/mean": 0.91162109375, + "rewards/tag_count_reward/std": 0.21244709193706512, "step": 555 }, { @@ -16110,27 +16110,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.16015625, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 1115.583984375, - "completions/mean_terminated_length": 937.7744140625, - "completions/min_length": 28.0, - "completions/min_terminated_length": 28.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1110.333984375, + "completions/mean_terminated_length": 1026.54248046875, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.18980967824528464, - "grad_norm": 10.092026710510254, - "kl": 1.404296875, - "learning_rate": 9.78091250027937e-07, - "loss": 0.31, - "num_tokens": 368012039.0, - "reward": 0.9296875, - "reward_std": 0.29066091775894165, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, + "grad_norm": 0.2713336944580078, + "kl": 0.0731201171875, + "learning_rate": 9.782398209778744e-07, + "loss": 0.0739, + "num_tokens": 396992753.0, + "reward": 1.01171875, + "reward_std": 0.23044559359550476, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87890625, - "rewards/tag_count_reward/std": 0.24874316155910492, + "rewards/tag_count_reward/mean": 0.943359375, + "rewards/tag_count_reward/std": 0.1586231142282486, "step": 556 }, { @@ -16139,27 +16139,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.212890625, + "completions/clipped_ratio": 0.169921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 1170.2734375, - "completions/mean_terminated_length": 932.8734130859375, - "completions/min_length": 233.0, - "completions/min_terminated_length": 233.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1158.2421875, + "completions/mean_terminated_length": 976.103515625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.19015106255867542, - "grad_norm": 9.27359390258789, - "kl": 1.453125, - "learning_rate": 9.779257050581316e-07, - "loss": 0.2855, - "num_tokens": 368685555.0, - "reward": 0.95361328125, - "reward_std": 0.3154717683792114, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, + "grad_norm": 25.266206741333008, + "kl": 0.24407958984375, + "learning_rate": 9.78074760494584e-07, + "loss": 0.0873, + "num_tokens": 397660109.0, + "reward": 1.04052734375, + "reward_std": 0.28423190116882324, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.86181640625, - "rewards/tag_count_reward/std": 0.2576083242893219, + "rewards/tag_count_reward/mean": 0.89794921875, + "rewards/tag_count_reward/std": 0.22110001742839813, "step": 557 }, { @@ -16168,27 +16168,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.2734375, + "completions/clipped_ratio": 0.11328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1283.447265625, - "completions/mean_terminated_length": 995.7123413085938, - "completions/min_length": 264.0, - "completions/min_terminated_length": 264.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1219.36328125, + "completions/mean_terminated_length": 1113.502197265625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, "epoch": 0.19049244687206623, - "grad_norm": 14.146623611450195, - "kl": 1.919921875, - "learning_rate": 9.777595527260567e-07, - "loss": 0.3542, - "num_tokens": 369419384.0, - "reward": 0.84765625, - "reward_std": 0.32374101877212524, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, + "grad_norm": 3.4805595874786377, + "kl": 0.095458984375, + "learning_rate": 9.77909091976398e-07, + "loss": 0.0796, + "num_tokens": 398361127.0, + "reward": 1.00439453125, + "reward_std": 0.24572907388210297, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8046875, - "rewards/tag_count_reward/std": 0.2987048029899597, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.18900687992572784, "step": 558 }, { @@ -16197,27 +16197,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.19921875, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 1135.689453125, - "completions/mean_terminated_length": 908.724365234375, - "completions/min_length": 271.0, - "completions/min_terminated_length": 271.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1084.763671875, + "completions/mean_terminated_length": 1026.9295654296875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, "epoch": 0.19083383118545702, - "grad_norm": 10.603168487548828, - "kl": 2.337890625, - "learning_rate": 9.77592793267535e-07, - "loss": 0.4041, - "num_tokens": 370075289.0, - "reward": 0.880859375, - "reward_std": 0.2896808087825775, - "rewards/accuracy_reward/mean": 0.02734375, - "rewards/accuracy_reward/std": 0.16324250400066376, + "grad_norm": 0.35506051778793335, + "kl": 0.0626220703125, + "learning_rate": 9.77742815658631e-07, + "loss": 0.0797, + "num_tokens": 398990958.0, + "reward": 1.015625, + "reward_std": 0.19875165820121765, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.853515625, - "rewards/tag_count_reward/std": 0.27229785919189453, + "rewards/tag_count_reward/mean": 0.951171875, + "rewards/tag_count_reward/std": 0.14856980741024017, "step": 559 }, { @@ -16226,27 +16226,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.193359375, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1923.0, - "completions/mean_length": 1135.6640625, - "completions/mean_terminated_length": 916.9685668945312, - "completions/min_length": 235.0, - "completions/min_terminated_length": 235.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1104.12109375, + "completions/mean_terminated_length": 1026.2960205078125, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, "epoch": 0.19117521549884783, - "grad_norm": 437.899169921875, - "kl": 9.3515625, - "learning_rate": 9.774254269192506e-07, - "loss": 0.6456, - "num_tokens": 370730173.0, - "reward": 0.98486328125, - "reward_std": 0.371319979429245, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, + "grad_norm": 0.5398379564285278, + "kl": 0.066650390625, + "learning_rate": 9.775759317774608e-07, + "loss": 0.0758, + "num_tokens": 399629692.0, + "reward": 1.119140625, + "reward_std": 0.2749728262424469, + "rewards/accuracy_reward/mean": 0.18359375, + "rewards/accuracy_reward/std": 0.3875311613082886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.85400390625, - "rewards/tag_count_reward/std": 0.26643362641334534, + "rewards/tag_count_reward/mean": 0.935546875, + "rewards/tag_count_reward/std": 0.17968250811100006, "step": 560 }, { @@ -16255,27 +16255,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.20703125, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1990.0, - "completions/mean_length": 1163.13671875, - "completions/mean_terminated_length": 932.11328125, - "completions/min_length": 309.0, - "completions/min_terminated_length": 309.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1147.884765625, + "completions/mean_terminated_length": 1059.0322265625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.19151659981223862, - "grad_norm": 67.19200134277344, - "kl": 3.98046875, - "learning_rate": 9.772574539187503e-07, - "loss": 0.4641, - "num_tokens": 371398755.0, - "reward": 0.94287109375, - "reward_std": 0.38048410415649414, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, + "grad_norm": 0.42627501487731934, + "kl": 0.05914306640625, + "learning_rate": 9.774084405699285e-07, + "loss": 0.081, + "num_tokens": 400290465.0, + "reward": 1.04296875, + "reward_std": 0.2823117971420288, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.84130859375, - "rewards/tag_count_reward/std": 0.28297698497772217, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.19436629116535187, "step": 561 }, { @@ -16284,27 +16284,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.271484375, + "completions/clipped_ratio": 0.119140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 1219.1875, - "completions/mean_terminated_length": 910.3270874023438, - "completions/min_length": 233.0, - "completions/min_terminated_length": 233.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1150.671875, + "completions/mean_terminated_length": 1029.3037109375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, "epoch": 0.19185798412562943, - "grad_norm": 13.20149040222168, - "kl": 2.9609375, - "learning_rate": 9.770888745044405e-07, - "loss": 0.4152, - "num_tokens": 372104707.0, - "reward": 0.892578125, - "reward_std": 0.3713717460632324, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, + "grad_norm": 0.3655388653278351, + "kl": 0.05712890625, + "learning_rate": 9.772403422739374e-07, + "loss": 0.0923, + "num_tokens": 400961337.0, + "reward": 1.048828125, + "reward_std": 0.27918076515197754, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.80078125, - "rewards/tag_count_reward/std": 0.3082514703273773, + "rewards/tag_count_reward/mean": 0.912109375, + "rewards/tag_count_reward/std": 0.20592933893203735, "step": 562 }, { @@ -16313,27 +16313,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.212890625, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1971.0, - "completions/mean_length": 1118.677734375, - "completions/mean_terminated_length": 867.3225708007812, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1117.5390625, + "completions/mean_terminated_length": 989.3422241210938, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, "epoch": 0.19219936843902022, - "grad_norm": 40.9189453125, - "kl": 2.916015625, - "learning_rate": 9.769196889155888e-07, - "loss": 0.3503, - "num_tokens": 372752974.0, - "reward": 0.91259765625, - "reward_std": 0.2583797872066498, - "rewards/accuracy_reward/mean": 0.07459677755832672, - "rewards/accuracy_reward/std": 0.263004869222641, + "grad_norm": 0.37263110280036926, + "kl": 0.0657958984375, + "learning_rate": 9.770716371282538e-07, + "loss": 0.0957, + "num_tokens": 401609021.0, + "reward": 0.97119140625, + "reward_std": 0.23943334817886353, + "rewards/accuracy_reward/mean": 0.06854838877916336, + "rewards/accuracy_reward/std": 0.25293970108032227, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.84033203125, - "rewards/tag_count_reward/std": 0.284153014421463, + "rewards/tag_count_reward/mean": 0.90478515625, + "rewards/tag_count_reward/std": 0.21579405665397644, "step": 563 }, { @@ -16342,27 +16342,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.279296875, + "completions/clipped_ratio": 0.14453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 1222.16015625, - "completions/mean_terminated_length": 902.1192626953125, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1173.470703125, + "completions/mean_terminated_length": 1025.7191162109375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.19254075275241103, - "grad_norm": 27.644025802612305, - "kl": 2.75390625, - "learning_rate": 9.767498973923236e-07, - "loss": 0.4167, - "num_tokens": 373465888.0, - "reward": 0.92431640625, - "reward_std": 0.3687335252761841, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, + "grad_norm": 0.2630736231803894, + "kl": 0.06439208984375, + "learning_rate": 9.769023253725047e-07, + "loss": 0.08, + "num_tokens": 402297006.0, + "reward": 1.0703125, + "reward_std": 0.2503066062927246, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.80126953125, - "rewards/tag_count_reward/std": 0.3087652325630188, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.21157780289649963, "step": 564 }, { @@ -16371,27 +16371,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.236328125, + "completions/clipped_ratio": 0.115234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1968.0, - "completions/mean_length": 1258.4296875, - "completions/mean_terminated_length": 1014.0869750976562, - "completions/min_length": 300.0, - "completions/min_terminated_length": 300.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1212.0546875, + "completions/mean_terminated_length": 1103.1788330078125, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, "epoch": 0.19288213706580182, - "grad_norm": 13.675626754760742, - "kl": 2.2109375, - "learning_rate": 9.765795001756326e-07, - "loss": 0.3625, - "num_tokens": 374184876.0, - "reward": 0.91748046875, - "reward_std": 0.359503835439682, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.82568359375, - "rewards/tag_count_reward/std": 0.28755927085876465, + "grad_norm": 0.34691181778907776, + "kl": 0.0675048828125, + "learning_rate": 9.767324072471803e-07, + "loss": 0.1125, + "num_tokens": 402992250.0, + "reward": 1.05224609375, + "reward_std": 0.2543666660785675, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91943359375, + "rewards/tag_count_reward/std": 0.19716253876686096, "step": 565 }, { @@ -16400,27 +16400,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.17578125, + "completions/clipped_ratio": 0.119140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1161.28125, - "completions/mean_terminated_length": 972.170654296875, - "completions/min_length": 176.0, - "completions/min_terminated_length": 176.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1192.375, + "completions/mean_terminated_length": 1076.6474609375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.19322352137919263, - "grad_norm": 13.544975280761719, - "kl": 1.93359375, - "learning_rate": 9.764084975073635e-07, - "loss": 0.3726, - "num_tokens": 374861788.0, - "reward": 0.9453125, - "reward_std": 0.3162229359149933, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, + "grad_norm": 0.6449598073959351, + "kl": 0.07659912109375, + "learning_rate": 9.76561882993631e-07, + "loss": 0.1298, + "num_tokens": 403685082.0, + "reward": 1.0283203125, + "reward_std": 0.29140976071357727, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.865234375, - "rewards/tag_count_reward/std": 0.263393372297287, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.20515529811382294, "step": 566 }, { @@ -16429,27 +16429,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.21875, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 1153.486328125, - "completions/mean_terminated_length": 903.0224609375, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1077.818359375, + "completions/mean_terminated_length": 982.0493774414062, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, "epoch": 0.19356490569258342, - "grad_norm": 16.100940704345703, - "kl": 1.78125, - "learning_rate": 9.762368896302234e-07, - "loss": 0.4092, - "num_tokens": 375536773.0, - "reward": 0.90087890625, - "reward_std": 0.3287081718444824, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, + "grad_norm": 0.7065788507461548, + "kl": 0.07843017578125, + "learning_rate": 9.763907528540684e-07, + "loss": 0.1166, + "num_tokens": 404321325.0, + "reward": 1.025390625, + "reward_std": 0.24978119134902954, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.83447265625, - "rewards/tag_count_reward/std": 0.2876589298248291, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.18686699867248535, "step": 567 }, { @@ -16458,27 +16458,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.201171875, + "completions/clipped_ratio": 0.111328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1151.13671875, - "completions/mean_terminated_length": 925.2763061523438, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1140.861328125, + "completions/mean_terminated_length": 1027.2198486328125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.19390629000597423, - "grad_norm": 108.90221405029297, - "kl": 4.123046875, - "learning_rate": 9.760646767877784e-07, - "loss": 0.3934, - "num_tokens": 376199451.0, - "reward": 0.9091796875, - "reward_std": 0.3188011348247528, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, + "grad_norm": 0.5742402672767639, + "kl": 0.1163330078125, + "learning_rate": 9.762190170715649e-07, + "loss": 0.086, + "num_tokens": 404978742.0, + "reward": 1.072265625, + "reward_std": 0.26042798161506653, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8447265625, - "rewards/tag_count_reward/std": 0.28336378931999207, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.18861782550811768, "step": 568 }, { @@ -16487,27 +16487,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.2265625, + "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 1247.072265625, - "completions/mean_terminated_length": 1012.4570922851562, - "completions/min_length": 198.0, - "completions/min_terminated_length": 198.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1228.419921875, + "completions/mean_terminated_length": 1119.6260986328125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, "epoch": 0.19424767431936502, - "grad_norm": 6.915520668029785, - "kl": 1.462890625, - "learning_rate": 9.758918592244528e-07, - "loss": 0.2869, - "num_tokens": 376910864.0, - "reward": 0.9052734375, - "reward_std": 0.31668218970298767, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, + "grad_norm": 0.48563316464424133, + "kl": 0.1265869140625, + "learning_rate": 9.760466758900526e-07, + "loss": 0.095, + "num_tokens": 405680605.0, + "reward": 1.01953125, + "reward_std": 0.2674994468688965, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8212890625, - "rewards/tag_count_reward/std": 0.29724088311195374, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.20338943600654602, "step": 569 }, { @@ -16516,27 +16516,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.244140625, + "completions/clipped_ratio": 0.1015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1875.0, - "completions/mean_length": 1207.375, - "completions/mean_terminated_length": 935.8553466796875, - "completions/min_length": 295.0, - "completions/min_terminated_length": 295.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1154.271484375, + "completions/mean_terminated_length": 1053.2412109375, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, "epoch": 0.19458905863275583, - "grad_norm": 6.648573875427246, - "kl": 1.451171875, - "learning_rate": 9.757184371855298e-07, - "loss": 0.3194, - "num_tokens": 377602336.0, - "reward": 0.89208984375, - "reward_std": 0.34159648418426514, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.0, - "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.80615234375, - "rewards/tag_count_reward/std": 0.3091023564338684, + "grad_norm": 0.8530169129371643, + "kl": 0.145751953125, + "learning_rate": 9.758737295543246e-07, + "loss": 0.1397, + "num_tokens": 406344888.0, + "reward": 1.02001953125, + "reward_std": 0.26721084117889404, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.92236328125, + "rewards/tag_count_reward/std": 0.19709952175617218, "step": 570 }, { @@ -16545,27 +16545,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.236328125, + "completions/clipped_ratio": 0.103515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 1252.3671875, - "completions/mean_terminated_length": 1006.1483154296875, - "completions/min_length": 255.0, - "completions/min_terminated_length": 255.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1167.205078125, + "completions/mean_terminated_length": 1065.5010986328125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.19493044294614661, - "grad_norm": 5.759603500366211, - "kl": 1.298828125, - "learning_rate": 9.7554441091715e-07, - "loss": 0.2955, - "num_tokens": 378328108.0, - "reward": 0.8935546875, - "reward_std": 0.35490304231643677, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, + "grad_norm": 2.242124080657959, + "kl": 0.24560546875, + "learning_rate": 9.757001783100323e-07, + "loss": 0.0968, + "num_tokens": 407027057.0, + "reward": 1.0234375, + "reward_std": 0.227874293923378, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8173828125, - "rewards/tag_count_reward/std": 0.303035169839859, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.19409078359603882, "step": 571 }, { @@ -16574,27 +16574,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1953125, + "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1947.0, - "completions/mean_length": 1162.66796875, - "completions/mean_terminated_length": 947.7815551757812, - "completions/min_length": 269.0, - "completions/min_terminated_length": 269.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1108.5390625, + "completions/mean_terminated_length": 995.474853515625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, "epoch": 0.19527182725953743, - "grad_norm": 12.158112525939941, - "kl": 1.578125, - "learning_rate": 9.753697806663124e-07, - "loss": 0.2999, - "num_tokens": 379000018.0, - "reward": 0.9765625, - "reward_std": 0.3368375897407532, - "rewards/accuracy_reward/mean": 0.12890625, - "rewards/accuracy_reward/std": 0.33542385697364807, + "grad_norm": 2.936920166015625, + "kl": 0.2293701171875, + "learning_rate": 9.755260224036872e-07, + "loss": 0.1002, + "num_tokens": 407671253.0, + "reward": 1.05615234375, + "reward_std": 0.2248762547969818, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.84765625, - "rewards/tag_count_reward/std": 0.2810630798339844, + "rewards/tag_count_reward/mean": 0.93310546875, + "rewards/tag_count_reward/std": 0.17357957363128662, "step": 572 }, { @@ -16603,27 +16603,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.2109375, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 1231.03515625, - "completions/mean_terminated_length": 1012.6386108398438, - "completions/min_length": 233.0, - "completions/min_terminated_length": 233.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1145.361328125, + "completions/mean_terminated_length": 1070.9365234375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.19561321157292821, - "grad_norm": 4.52598762512207, - "kl": 1.0849609375, - "learning_rate": 9.75194546680872e-07, - "loss": 0.2984, - "num_tokens": 379706836.0, - "reward": 0.88427734375, - "reward_std": 0.2944050431251526, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, + "grad_norm": 0.45513296127319336, + "kl": 0.1466064453125, + "learning_rate": 9.753512620826592e-07, + "loss": 0.1024, + "num_tokens": 408334206.0, + "reward": 1.00927734375, + "reward_std": 0.17300641536712646, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.84130859375, - "rewards/tag_count_reward/std": 0.2821112275123596, + "rewards/tag_count_reward/mean": 0.95263671875, + "rewards/tag_count_reward/std": 0.1418975442647934, "step": 573 }, { @@ -16632,27 +16632,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.27734375, + "completions/clipped_ratio": 0.095703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 1284.548828125, - "completions/mean_terminated_length": 991.5486450195312, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1160.25, + "completions/mean_terminated_length": 1066.2979736328125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.19595459588631903, - "grad_norm": 5.709986686706543, - "kl": 1.51953125, - "learning_rate": 9.750187092095422e-07, - "loss": 0.3602, - "num_tokens": 380441357.0, - "reward": 0.84228515625, - "reward_std": 0.31777364015579224, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, + "grad_norm": 11.788080215454102, + "kl": 0.33154296875, + "learning_rate": 9.751758975951767e-07, + "loss": 0.09, + "num_tokens": 409005086.0, + "reward": 1.015625, + "reward_std": 0.22155682742595673, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.80322265625, - "rewards/tag_count_reward/std": 0.3060453534126282, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.15556970238685608, "step": 574 }, { @@ -16661,27 +16661,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.23046875, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 1172.1015625, - "completions/mean_terminated_length": 909.776611328125, - "completions/min_length": 217.0, - "completions/min_terminated_length": 217.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1078.783203125, + "completions/mean_terminated_length": 994.4140625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.1962959801997098, - "grad_norm": 18.69489860534668, - "kl": 1.7578125, - "learning_rate": 9.748422685018911e-07, - "loss": 0.3303, - "num_tokens": 381115553.0, - "reward": 1.0009765625, - "reward_std": 0.39446723461151123, - "rewards/accuracy_reward/mean": 0.15625, - "rewards/accuracy_reward/std": 0.36344730854034424, + "grad_norm": 0.4927197992801666, + "kl": 0.150634765625, + "learning_rate": 9.749999291903267e-07, + "loss": 0.0712, + "num_tokens": 409631503.0, + "reward": 1.12158203125, + "reward_std": 0.28047245740890503, + "rewards/accuracy_reward/mean": 0.177734375, + "rewards/accuracy_reward/std": 0.3826628625392914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8447265625, - "rewards/tag_count_reward/std": 0.28032606840133667, + "rewards/tag_count_reward/mean": 0.94384765625, + "rewards/tag_count_reward/std": 0.16372689604759216, "step": 575 }, { @@ -16690,27 +16690,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.2421875, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1263.09765625, - "completions/mean_terminated_length": 1012.2525634765625, - "completions/min_length": 228.0, - "completions/min_terminated_length": 228.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1155.580078125, + "completions/mean_terminated_length": 1084.0357666015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, "epoch": 0.19663736451310063, - "grad_norm": 17.96137046813965, - "kl": 1.458984375, - "learning_rate": 9.74665224808345e-07, - "loss": 0.29, - "num_tokens": 381847219.0, - "reward": 0.8447265625, - "reward_std": 0.28833621740341187, - "rewards/accuracy_reward/mean": 0.02822580561041832, - "rewards/accuracy_reward/std": 0.1657845675945282, - "rewards/format_reward/mean": 0.0, - "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8173828125, - "rewards/tag_count_reward/std": 0.2989718019962311, + "grad_norm": 0.2798035442829132, + "kl": 0.1241455078125, + "learning_rate": 9.748233571180536e-07, + "loss": 0.0676, + "num_tokens": 410308120.0, + "reward": 0.9853515625, + "reward_std": 0.1960594654083252, + "rewards/accuracy_reward/mean": 0.04233871027827263, + "rewards/accuracy_reward/std": 0.2015640139579773, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9443359375, + "rewards/tag_count_reward/std": 0.16201746463775635, "step": 576 }, { @@ -16719,27 +16719,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.2109375, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 1207.884765625, - "completions/mean_terminated_length": 983.2994995117188, - "completions/min_length": 277.0, - "completions/min_terminated_length": 277.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1098.4375, + "completions/mean_terminated_length": 1035.1334228515625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.1969787488264914, - "grad_norm": 6.886361598968506, - "kl": 1.271484375, - "learning_rate": 9.744875783801844e-07, - "loss": 0.2947, - "num_tokens": 382545448.0, - "reward": 0.9453125, - "reward_std": 0.34676775336265564, + "grad_norm": 4.284827709197998, + "kl": 0.2630615234375, + "learning_rate": 9.74646181629159e-07, + "loss": 0.0988, + "num_tokens": 410950312.0, + "reward": 1.0498046875, + "reward_std": 0.2191469520330429, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.84765625, - "rewards/tag_count_reward/std": 0.27843984961509705, + "rewards/tag_count_reward/mean": 0.9521484375, + "rewards/tag_count_reward/std": 0.15052182972431183, "step": 577 }, { @@ -16748,27 +16748,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.171875, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 1181.689453125, - "completions/mean_terminated_length": 1001.88916015625, - "completions/min_length": 216.0, - "completions/min_terminated_length": 216.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1094.064453125, + "completions/mean_terminated_length": 1028.344482421875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, "epoch": 0.19732013313988223, - "grad_norm": 10.539137840270996, - "kl": 1.26171875, - "learning_rate": 9.743093294695461e-07, - "loss": 0.2806, - "num_tokens": 383226905.0, - "reward": 0.94677734375, - "reward_std": 0.33402007818222046, - "rewards/accuracy_reward/mean": 0.08064515888690948, - "rewards/accuracy_reward/std": 0.2725643217563629, + "grad_norm": 83.3451919555664, + "kl": 1.0875244140625, + "learning_rate": 9.744684029753026e-07, + "loss": 0.1079, + "num_tokens": 411586905.0, + "reward": 1.0888671875, + "reward_std": 0.24662557244300842, + "rewards/accuracy_reward/mean": 0.14516128599643707, + "rewards/accuracy_reward/std": 0.3526190221309662, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.86865234375, - "rewards/tag_count_reward/std": 0.2667275369167328, + "rewards/tag_count_reward/mean": 0.9482421875, + "rewards/tag_count_reward/std": 0.15085157752037048, "step": 578 }, { @@ -16777,27 +16777,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.173828125, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 1234.08203125, - "completions/mean_terminated_length": 1062.8321533203125, - "completions/min_length": 238.0, - "completions/min_terminated_length": 238.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1133.701171875, + "completions/mean_terminated_length": 1058.31494140625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.197661517453273, - "grad_norm": 6.447632312774658, - "kl": 0.826171875, - "learning_rate": 9.741304783294218e-07, - "loss": 0.2191, - "num_tokens": 383938131.0, - "reward": 0.96484375, - "reward_std": 0.272844135761261, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, + "grad_norm": 0.9344030022621155, + "kl": 0.0947265625, + "learning_rate": 9.742900214089994e-07, + "loss": 0.0975, + "num_tokens": 412246736.0, + "reward": 1.0185546875, + "reward_std": 0.2350711077451706, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.876953125, - "rewards/tag_count_reward/std": 0.2507251501083374, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.18318742513656616, "step": 579 }, { @@ -16806,27 +16806,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1177.27734375, - "completions/mean_terminated_length": 1052.888427734375, - "completions/min_length": 271.0, - "completions/min_terminated_length": 271.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1107.046875, + "completions/mean_terminated_length": 1054.6639404296875, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.19800290176666382, - "grad_norm": 8.025528907775879, - "kl": 0.7685546875, - "learning_rate": 9.739510252136584e-07, - "loss": 0.2053, - "num_tokens": 384619873.0, - "reward": 1.0771484375, - "reward_std": 0.3427858352661133, - "rewards/accuracy_reward/mean": 0.16796875, - "rewards/accuracy_reward/std": 0.374204158782959, + "grad_norm": 0.5140022039413452, + "kl": 0.0635986328125, + "learning_rate": 9.741110371836224e-07, + "loss": 0.0381, + "num_tokens": 412892520.0, + "reward": 1.1103515625, + "reward_std": 0.2751343846321106, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9091796875, - "rewards/tag_count_reward/std": 0.21964147686958313, + "rewards/tag_count_reward/mean": 0.9462890625, + "rewards/tag_count_reward/std": 0.15964092314243317, "step": 580 }, { @@ -16835,27 +16835,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.13671875, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1070.193359375, - "completions/mean_terminated_length": 915.337158203125, - "completions/min_length": 228.0, - "completions/min_terminated_length": 228.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1048.87109375, + "completions/mean_terminated_length": 975.5596923828125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.1983442860800546, - "grad_norm": 7.305599212646484, - "kl": 0.5322265625, - "learning_rate": 9.737709703769562e-07, - "loss": 0.1573, - "num_tokens": 385249684.0, - "reward": 0.982421875, - "reward_std": 0.26694631576538086, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, + "grad_norm": 0.34006255865097046, + "kl": 0.06719970703125, + "learning_rate": 9.739314505533989e-07, + "loss": 0.0814, + "num_tokens": 413511414.0, + "reward": 1.01025390625, + "reward_std": 0.22366735339164734, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90234375, - "rewards/tag_count_reward/std": 0.23194991052150726, + "rewards/tag_count_reward/mean": 0.94580078125, + "rewards/tag_count_reward/std": 0.16363932192325592, "step": 581 }, { @@ -16864,27 +16864,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.13671875, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1132.24609375, - "completions/mean_terminated_length": 987.2172241210938, - "completions/min_length": 271.0, - "completions/min_terminated_length": 271.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1029.501953125, + "completions/mean_terminated_length": 968.3499145507812, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.19868567039344542, - "grad_norm": 8.776261329650879, - "kl": 0.431640625, - "learning_rate": 9.735903140748702e-07, - "loss": 0.2056, - "num_tokens": 385915458.0, - "reward": 1.03662109375, - "reward_std": 0.29240643978118896, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, + "grad_norm": 0.2730991542339325, + "kl": 0.064208984375, + "learning_rate": 9.73751261773413e-07, + "loss": 0.0427, + "num_tokens": 414124583.0, + "reward": 1.08056640625, + "reward_std": 0.2226194590330124, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90380859375, - "rewards/tag_count_reward/std": 0.22535066306591034, + "rewards/tag_count_reward/mean": 0.95361328125, + "rewards/tag_count_reward/std": 0.13961657881736755, "step": 582 }, { @@ -16893,27 +16893,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.099609375, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 1057.48046875, - "completions/mean_terminated_length": 947.9002075195312, - "completions/min_length": 235.0, - "completions/min_terminated_length": 235.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 1001.361328125, + "completions/mean_terminated_length": 945.3682861328125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.1990270547068362, - "grad_norm": 8.846368789672852, - "kl": 0.49609375, - "learning_rate": 9.734090565638092e-07, - "loss": 0.1636, - "num_tokens": 386528280.0, - "reward": 1.041015625, - "reward_std": 0.28835421800613403, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, + "grad_norm": 0.39661121368408203, + "kl": 0.077880859375, + "learning_rate": 9.735704710996043e-07, + "loss": 0.0598, + "num_tokens": 414708672.0, + "reward": 1.08154296875, + "reward_std": 0.26310551166534424, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.1969740390777588, + "rewards/tag_count_reward/mean": 0.95849609375, + "rewards/tag_count_reward/std": 0.1445726603269577, "step": 583 }, { @@ -16922,27 +16922,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.130859375, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 1148.72265625, - "completions/mean_terminated_length": 1013.3258666992188, - "completions/min_length": 336.0, - "completions/min_terminated_length": 336.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 1051.6171875, + "completions/mean_terminated_length": 991.79296875, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.19936843902022702, - "grad_norm": 3.6344919204711914, - "kl": 0.74609375, - "learning_rate": 9.73227198101035e-07, - "loss": 0.2193, - "num_tokens": 387198378.0, - "reward": 0.97216796875, - "reward_std": 0.25956839323043823, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, + "grad_norm": 0.29163286089897156, + "kl": 0.07293701171875, + "learning_rate": 9.73389078788766e-07, + "loss": 0.0647, + "num_tokens": 415329052.0, + "reward": 1.0439453125, + "reward_std": 0.198550745844841, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90576171875, - "rewards/tag_count_reward/std": 0.2195909023284912, + "rewards/tag_count_reward/mean": 0.9541015625, + "rewards/tag_count_reward/std": 0.1419488787651062, "step": 584 }, { @@ -16951,27 +16951,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.11328125, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 989.21875, - "completions/mean_terminated_length": 853.9559326171875, - "completions/min_length": 297.0, - "completions/min_terminated_length": 297.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 905.3515625, + "completions/mean_terminated_length": 856.480712890625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.1997098233336178, - "grad_norm": 7.630433082580566, - "kl": 0.58349609375, - "learning_rate": 9.730447389446623e-07, - "loss": 0.1664, - "num_tokens": 387774170.0, - "reward": 1.05322265625, - "reward_std": 0.28049057722091675, - "rewards/accuracy_reward/mean": 0.12890625, - "rewards/accuracy_reward/std": 0.33542385697364807, + "grad_norm": 0.281498521566391, + "kl": 0.0682373046875, + "learning_rate": 9.732070850985472e-07, + "loss": 0.0499, + "num_tokens": 415861904.0, + "reward": 1.1123046875, + "reward_std": 0.23264080286026, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.19157785177230835, + "rewards/tag_count_reward/mean": 0.9677734375, + "rewards/tag_count_reward/std": 0.11466160416603088, "step": 585 }, { @@ -16980,27 +16980,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1640625, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1135.650390625, - "completions/mean_terminated_length": 956.591064453125, - "completions/min_length": 211.0, - "completions/min_terminated_length": 211.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1053.068359375, + "completions/mean_terminated_length": 995.5103149414062, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.20005120764700862, - "grad_norm": 43.59455871582031, - "kl": 1.6923828125, - "learning_rate": 9.728616793536587e-07, - "loss": 0.2915, - "num_tokens": 388433607.0, - "reward": 0.939453125, - "reward_std": 0.3105131983757019, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, + "grad_norm": 0.42289862036705017, + "kl": 0.085205078125, + "learning_rate": 9.730244902874507e-07, + "loss": 0.0564, + "num_tokens": 416479059.0, + "reward": 1.06005859375, + "reward_std": 0.22527457773685455, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.873046875, - "rewards/tag_count_reward/std": 0.26217159628868103, + "rewards/tag_count_reward/mean": 0.95849609375, + "rewards/tag_count_reward/std": 0.14201197028160095, "step": 586 }, { @@ -17009,27 +17009,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1875, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1983.0, - "completions/mean_length": 1157.76953125, - "completions/mean_terminated_length": 952.331787109375, - "completions/min_length": 259.0, - "completions/min_terminated_length": 259.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1084.958984375, + "completions/mean_terminated_length": 1003.3453369140625, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, "epoch": 0.2003925919603994, - "grad_norm": 2.1202924251556396, - "kl": 0.60986328125, - "learning_rate": 9.726780195878438e-07, - "loss": 0.2669, - "num_tokens": 389108625.0, - "reward": 0.93701171875, - "reward_std": 0.3265402317047119, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, + "grad_norm": 1.545763373374939, + "kl": 0.1494140625, + "learning_rate": 9.728412946148327e-07, + "loss": 0.0919, + "num_tokens": 417116798.0, + "reward": 1.0390625, + "reward_std": 0.2163223922252655, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.85498046875, - "rewards/tag_count_reward/std": 0.2777453064918518, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.15037257969379425, "step": 587 }, { @@ -17038,27 +17038,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.15625, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 1094.56640625, - "completions/mean_terminated_length": 918.004638671875, - "completions/min_length": 195.0, - "completions/min_terminated_length": 195.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1020.591796875, + "completions/mean_terminated_length": 967.8501586914062, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, "epoch": 0.20073397627379022, - "grad_norm": 6.691629409790039, - "kl": 0.939453125, - "learning_rate": 9.724937599078888e-07, - "loss": 0.2613, - "num_tokens": 389744771.0, - "reward": 0.97021484375, - "reward_std": 0.32121267914772034, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, + "grad_norm": 0.5551316142082214, + "kl": 0.1324462890625, + "learning_rate": 9.726574983409039e-07, + "loss": 0.0943, + "num_tokens": 417715069.0, + "reward": 1.0673828125, + "reward_std": 0.21322165429592133, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87255859375, - "rewards/tag_count_reward/std": 0.2663331925868988, + "rewards/tag_count_reward/mean": 0.9599609375, + "rewards/tag_count_reward/std": 0.14113877713680267, "step": 588 }, { @@ -17067,27 +17067,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.208984375, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 1166.4765625, - "completions/mean_terminated_length": 933.5802612304688, - "completions/min_length": 211.0, - "completions/min_terminated_length": 211.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1062.693359375, + "completions/mean_terminated_length": 979.1928100585938, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, "epoch": 0.201075360587181, - "grad_norm": 6.024787902832031, - "kl": 0.8759765625, - "learning_rate": 9.72308900575317e-07, - "loss": 0.2851, - "num_tokens": 390420055.0, - "reward": 0.9619140625, - "reward_std": 0.3914545178413391, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310528099536896, + "grad_norm": 661676.5625, + "kl": 4672.11572265625, + "learning_rate": 9.724731017267267e-07, + "loss": 186.9552, + "num_tokens": 418337216.0, + "reward": 1.12353515625, + "reward_std": 0.3131940960884094, + "rewards/accuracy_reward/mean": 0.19959677755832672, + "rewards/accuracy_reward/std": 0.40010079741477966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8408203125, - "rewards/tag_count_reward/std": 0.2897527813911438, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.18932512402534485, "step": 589 }, { @@ -17096,27 +17096,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.142578125, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1959.0, - "completions/mean_length": 1046.951171875, - "completions/mean_terminated_length": 880.4898071289062, - "completions/min_length": 232.0, - "completions/min_terminated_length": 232.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1016.14453125, + "completions/mean_terminated_length": 938.1051025390625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, "epoch": 0.20141674490057182, - "grad_norm": 4.769101142883301, - "kl": 0.75390625, - "learning_rate": 9.72123441852502e-07, - "loss": 0.2474, - "num_tokens": 391023566.0, - "reward": 1.03515625, - "reward_std": 0.34677445888519287, - "rewards/accuracy_reward/mean": 0.150390625, - "rewards/accuracy_reward/std": 0.35780346393585205, + "grad_norm": 112.77184295654297, + "kl": 1.376220703125, + "learning_rate": 9.722881050342175e-07, + "loss": 0.14, + "num_tokens": 418924954.0, + "reward": 1.09814453125, + "reward_std": 0.2744031548500061, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.884765625, - "rewards/tag_count_reward/std": 0.2544175982475281, + "rewards/tag_count_reward/mean": 0.94384765625, + "rewards/tag_count_reward/std": 0.1666882485151291, "step": 590 }, { @@ -17125,27 +17125,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.15234375, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 1102.0078125, - "completions/mean_terminated_length": 931.9907836914062, - "completions/min_length": 275.0, - "completions/min_terminated_length": 275.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1049.443359375, + "completions/mean_terminated_length": 982.8729858398438, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.2017581292139626, - "grad_norm": 2.9391894340515137, - "kl": 0.791015625, - "learning_rate": 9.719373840026686e-07, - "loss": 0.2679, - "num_tokens": 391669458.0, - "reward": 0.978515625, - "reward_std": 0.29991257190704346, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, + "grad_norm": 10.41447925567627, + "kl": 0.300048828125, + "learning_rate": 9.721025085261442e-07, + "loss": 0.1252, + "num_tokens": 419543933.0, + "reward": 1.0625, + "reward_std": 0.21535976231098175, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87890625, - "rewards/tag_count_reward/std": 0.2626158595085144, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.1483767330646515, "step": 591 }, { @@ -17154,27 +17154,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.123046875, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 1028.095703125, - "completions/mean_terminated_length": 884.9910888671875, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1055.271484375, + "completions/mean_terminated_length": 966.5595703125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, "epoch": 0.20209951352735342, - "grad_norm": 1.3754154443740845, - "kl": 0.55029296875, - "learning_rate": 9.717507272898922e-07, - "loss": 0.2428, - "num_tokens": 392269523.0, - "reward": 1.03662109375, - "reward_std": 0.3236616253852844, - "rewards/accuracy_reward/mean": 0.13671875, - "rewards/accuracy_reward/std": 0.3438861668109894, + "grad_norm": 1.1468920707702637, + "kl": 0.21044921875, + "learning_rate": 9.719163124661276e-07, + "loss": 0.0996, + "num_tokens": 420157912.0, + "reward": 1.0859375, + "reward_std": 0.24691808223724365, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89990234375, - "rewards/tag_count_reward/std": 0.24050459265708923, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.17556172609329224, "step": 592 }, { @@ -17183,27 +17183,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.12109375, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 1076.001953125, - "completions/mean_terminated_length": 942.082275390625, - "completions/min_length": 124.0, - "completions/min_terminated_length": 124.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1081.751953125, + "completions/mean_terminated_length": 1010.8532104492188, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, "epoch": 0.2024408978407442, - "grad_norm": 3.039691686630249, - "kl": 0.48388671875, - "learning_rate": 9.715634719790978e-07, - "loss": 0.1811, - "num_tokens": 392902180.0, - "reward": 0.990234375, - "reward_std": 0.2621752619743347, - "rewards/accuracy_reward/mean": 0.08669354766607285, - "rewards/accuracy_reward/std": 0.281669557094574, + "grad_norm": 40.83711242675781, + "kl": 0.5858154296875, + "learning_rate": 9.717295171186388e-07, + "loss": 0.0852, + "num_tokens": 420793513.0, + "reward": 1.060546875, + "reward_std": 0.19900736212730408, + "rewards/accuracy_reward/mean": 0.11491935700178146, + "rewards/accuracy_reward/std": 0.3192465901374817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90625, - "rewards/tag_count_reward/std": 0.23198285698890686, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.1495569944381714, "step": 593 }, { @@ -17212,27 +17212,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1171875, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 975.173828125, - "completions/mean_terminated_length": 832.7632446289062, - "completions/min_length": 175.0, - "completions/min_terminated_length": 175.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 977.927734375, + "completions/mean_terminated_length": 908.9625854492188, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.20278228215413502, - "grad_norm": 1.9965825080871582, - "kl": 0.48779296875, - "learning_rate": 9.713756183360597e-07, - "loss": 0.2001, - "num_tokens": 393472061.0, - "reward": 1.07666015625, - "reward_std": 0.3201131820678711, - "rewards/accuracy_reward/mean": 0.169921875, - "rewards/accuracy_reward/std": 0.3759314715862274, + "grad_norm": 0.5450801849365234, + "kl": 0.142578125, + "learning_rate": 9.71542122749001e-07, + "loss": 0.0414, + "num_tokens": 421364804.0, + "reward": 1.154296875, + "reward_std": 0.25699377059936523, + "rewards/accuracy_reward/mean": 0.19140625, + "rewards/accuracy_reward/std": 0.3937928080558777, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.22979721426963806, + "rewards/tag_count_reward/mean": 0.962890625, + "rewards/tag_count_reward/std": 0.1357729285955429, "step": 594 }, { @@ -17241,27 +17241,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.158203125, + "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 1195.353515625, - "completions/mean_terminated_length": 1035.111328125, - "completions/min_length": 275.0, - "completions/min_terminated_length": 275.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1201.599609375, + "completions/mean_terminated_length": 1097.65576171875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.2031236664675258, - "grad_norm": 1.442400574684143, - "kl": 0.55322265625, - "learning_rate": 9.711871666274021e-07, - "loss": 0.2017, - "num_tokens": 394158898.0, - "reward": 0.9970703125, - "reward_std": 0.3248283267021179, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, + "grad_norm": 0.7900511026382446, + "kl": 0.168701171875, + "learning_rate": 9.713541296233884e-07, + "loss": 0.1118, + "num_tokens": 422054839.0, + "reward": 1.0703125, + "reward_std": 0.26460564136505127, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8837890625, - "rewards/tag_count_reward/std": 0.2554128170013428, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.16772332787513733, "step": 595 }, { @@ -17270,27 +17270,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.107421875, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1080.25390625, - "completions/mean_terminated_length": 963.7855834960938, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1111.208984375, + "completions/mean_terminated_length": 1033.96826171875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, "epoch": 0.20346505078091662, - "grad_norm": 1.8731517791748047, - "kl": 0.53271484375, - "learning_rate": 9.709981171205977e-07, - "loss": 0.1676, - "num_tokens": 394777940.0, - "reward": 1.068359375, - "reward_std": 0.30113649368286133, - "rewards/accuracy_reward/mean": 0.16532258689403534, - "rewards/accuracy_reward/std": 0.371846467256546, + "grad_norm": 1.1540671586990356, + "kl": 0.1524658203125, + "learning_rate": 9.711655380088249e-07, + "loss": 0.0708, + "num_tokens": 422689730.0, + "reward": 1.12109375, + "reward_std": 0.23350730538368225, + "rewards/accuracy_reward/mean": 0.17741934955120087, + "rewards/accuracy_reward/std": 0.38240888714790344, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.908203125, - "rewards/tag_count_reward/std": 0.21979151666164398, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.14873693883419037, "step": 596 }, { @@ -17299,27 +17299,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.18359375, + "completions/clipped_ratio": 0.111328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 1195.24609375, - "completions/mean_terminated_length": 1003.4784545898438, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1189.44140625, + "completions/mean_terminated_length": 1081.8857421875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.2038064350943074, - "grad_norm": 3.4311375617980957, - "kl": 0.9697265625, - "learning_rate": 9.708084700839678e-07, - "loss": 0.2665, - "num_tokens": 395461810.0, - "reward": 0.95458984375, - "reward_std": 0.36056941747665405, - "rewards/accuracy_reward/mean": 0.10080645233392715, - "rewards/accuracy_reward/std": 0.30137622356414795, + "grad_norm": 0.5086233019828796, + "kl": 0.1591796875, + "learning_rate": 9.709763481731853e-07, + "loss": 0.0815, + "num_tokens": 423370628.0, + "reward": 1.02880859375, + "reward_std": 0.23846885561943054, + "rewards/accuracy_reward/mean": 0.10282257944345474, + "rewards/accuracy_reward/std": 0.30403366684913635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.85693359375, - "rewards/tag_count_reward/std": 0.2796345353126526, + "rewards/tag_count_reward/mean": 0.92919921875, + "rewards/tag_count_reward/std": 0.17967121303081512, "step": 597 }, { @@ -17328,27 +17328,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.119140625, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1077.2734375, - "completions/mean_terminated_length": 945.9778442382812, - "completions/min_length": 296.0, - "completions/min_terminated_length": 296.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1081.880859375, + "completions/mean_terminated_length": 1019.6154174804688, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, "epoch": 0.20414781940769822, - "grad_norm": 3.1357645988464355, - "kl": 0.65234375, - "learning_rate": 9.706182257866812e-07, - "loss": 0.1647, - "num_tokens": 396095806.0, - "reward": 1.0810546875, - "reward_std": 0.2692202031612396, - "rewards/accuracy_reward/mean": 0.169921875, - "rewards/accuracy_reward/std": 0.3759314715862274, + "grad_norm": 0.4582197666168213, + "kl": 0.1004638671875, + "learning_rate": 9.707865603851936e-07, + "loss": 0.0686, + "num_tokens": 424006983.0, + "reward": 1.12744140625, + "reward_std": 0.2185264229774475, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9111328125, - "rewards/tag_count_reward/std": 0.2280760258436203, + "rewards/tag_count_reward/mean": 0.95556640625, + "rewards/tag_count_reward/std": 0.13937683403491974, "step": 598 }, { @@ -17357,27 +17357,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.142578125, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 1061.494140625, - "completions/mean_terminated_length": 897.4510498046875, - "completions/min_length": 72.0, - "completions/min_terminated_length": 72.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1035.8046875, + "completions/mean_terminated_length": 981.654296875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, "epoch": 0.204489203721089, - "grad_norm": 3.2705795764923096, - "kl": 0.9462890625, - "learning_rate": 9.704273844987555e-07, - "loss": 0.2939, - "num_tokens": 396724923.0, - "reward": 1.00537109375, - "reward_std": 0.3218349814414978, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, + "grad_norm": 0.2900364398956299, + "kl": 0.0867919921875, + "learning_rate": 9.70596174914423e-07, + "loss": 0.0377, + "num_tokens": 424622947.0, + "reward": 1.1083984375, + "reward_std": 0.22132006287574768, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88818359375, - "rewards/tag_count_reward/std": 0.24445319175720215, + "rewards/tag_count_reward/mean": 0.9638671875, + "rewards/tag_count_reward/std": 0.1267271488904953, "step": 599 }, { @@ -17386,27 +17386,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.115234375, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 1054.888671875, - "completions/mean_terminated_length": 925.5430297851562, - "completions/min_length": 203.0, - "completions/min_terminated_length": 203.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1044.212890625, + "completions/mean_terminated_length": 1005.52734375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, "epoch": 0.20483058803447982, - "grad_norm": 10.378336906433105, - "kl": 1.1484375, - "learning_rate": 9.702359464910546e-07, - "loss": 0.2279, - "num_tokens": 397344994.0, - "reward": 1.06201171875, - "reward_std": 0.2892497181892395, - "rewards/accuracy_reward/mean": 0.158203125, - "rewards/accuracy_reward/std": 0.36528825759887695, + "grad_norm": 0.3275425434112549, + "kl": 0.0880126953125, + "learning_rate": 9.704051920312964e-07, + "loss": 0.0347, + "num_tokens": 425237552.0, + "reward": 1.10888671875, + "reward_std": 0.19505611062049866, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90380859375, - "rewards/tag_count_reward/std": 0.23124386370182037, + "rewards/tag_count_reward/mean": 0.96630859375, + "rewards/tag_count_reward/std": 0.11998306214809418, "step": 600 }, { @@ -17415,27 +17415,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.103515625, + "completions/clipped_ratio": 0.087890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1989.0, - "completions/mean_length": 1068.806640625, - "completions/mean_terminated_length": 955.74072265625, - "completions/min_length": 261.0, - "completions/min_terminated_length": 261.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1117.275390625, + "completions/mean_terminated_length": 1027.5909423828125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.20517197234787063, - "grad_norm": 6.7024993896484375, - "kl": 1.14453125, - "learning_rate": 9.700439120352898e-07, - "loss": 0.2144, - "num_tokens": 397968351.0, - "reward": 1.05126953125, - "reward_std": 0.3121991753578186, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, + "grad_norm": 3.3291659355163574, + "kl": 0.16650390625, + "learning_rate": 9.702136120070845e-07, + "loss": 0.0757, + "num_tokens": 425885725.0, + "reward": 1.0830078125, + "reward_std": 0.27318036556243896, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.20940732955932617, + "rewards/tag_count_reward/mean": 0.9423828125, + "rewards/tag_count_reward/std": 0.16208821535110474, "step": 601 }, { @@ -17444,27 +17444,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.095703125, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 1000.607421875, - "completions/mean_terminated_length": 889.76025390625, - "completions/min_length": 236.0, - "completions/min_terminated_length": 236.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1092.35546875, + "completions/mean_terminated_length": 1011.36865234375, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, "epoch": 0.20551335666126141, - "grad_norm": 6.8327250480651855, - "kl": 0.7802734375, - "learning_rate": 9.69851281404019e-07, - "loss": 0.1916, - "num_tokens": 398559590.0, - "reward": 1.0224609375, - "reward_std": 0.22936491668224335, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, + "grad_norm": 3.173513889312744, + "kl": 0.178955078125, + "learning_rate": 9.700214351139064e-07, + "loss": 0.1079, + "num_tokens": 426523939.0, + "reward": 1.08349609375, + "reward_std": 0.21784833073616028, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.20141369104385376, + "rewards/tag_count_reward/mean": 0.94873046875, + "rewards/tag_count_reward/std": 0.15697528421878815, "step": 602 }, { @@ -17473,27 +17473,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.119140625, + "completions/clipped_ratio": 0.1328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 1099.17578125, - "completions/mean_terminated_length": 970.8425903320312, - "completions/min_length": 271.0, - "completions/min_terminated_length": 271.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1231.22265625, + "completions/mean_terminated_length": 1106.130615234375, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, "epoch": 0.20585474097465223, - "grad_norm": 2.3790547847747803, - "kl": 0.669921875, - "learning_rate": 9.696580548706462e-07, - "loss": 0.1783, - "num_tokens": 399197216.0, - "reward": 1.03662109375, - "reward_std": 0.28269410133361816, - "rewards/accuracy_reward/mean": 0.119140625, - "rewards/accuracy_reward/std": 0.32427072525024414, + "grad_norm": 10.558650016784668, + "kl": 0.2490234375, + "learning_rate": 9.69828661624729e-07, + "loss": 0.117, + "num_tokens": 427229173.0, + "reward": 1.06640625, + "reward_std": 0.3105151355266571, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.21077179908752441, + "rewards/tag_count_reward/mean": 0.908203125, + "rewards/tag_count_reward/std": 0.2203473001718521, "step": 603 }, { @@ -17502,27 +17502,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.099609375, + "completions/clipped_ratio": 0.123046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 1123.58984375, - "completions/mean_terminated_length": 1021.3232421875, - "completions/min_length": 262.0, - "completions/min_terminated_length": 262.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1266.990234375, + "completions/mean_terminated_length": 1157.4053955078125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, "epoch": 0.20619612528804301, - "grad_norm": 5.899009704589844, - "kl": 0.904296875, - "learning_rate": 9.69464232709421e-07, - "loss": 0.1753, - "num_tokens": 399851470.0, - "reward": 1.0283203125, - "reward_std": 0.2769412398338318, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, + "grad_norm": 0.7882794141769409, + "kl": 0.093017578125, + "learning_rate": 9.696352918133672e-07, + "loss": 0.103, + "num_tokens": 427956848.0, + "reward": 1.06640625, + "reward_std": 0.2940067648887634, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.2063627392053604, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.19534705579280853, "step": 604 }, { @@ -17531,27 +17531,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.09375, + "completions/clipped_ratio": 0.123046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 1084.873046875, - "completions/mean_terminated_length": 985.2391967773438, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1214.0390625, + "completions/mean_terminated_length": 1097.0245361328125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.20653750960143383, - "grad_norm": 1.9711962938308716, - "kl": 0.537109375, - "learning_rate": 9.692698151954383e-07, - "loss": 0.1471, - "num_tokens": 400480045.0, - "reward": 1.08837890625, - "reward_std": 0.2671680748462677, - "rewards/accuracy_reward/mean": 0.15625, - "rewards/accuracy_reward/std": 0.36344730854034424, + "grad_norm": 0.7613054513931274, + "kl": 0.073974609375, + "learning_rate": 9.694413259544815e-07, + "loss": 0.1006, + "num_tokens": 428651556.0, + "reward": 1.0986328125, + "reward_std": 0.2871810495853424, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.18214841187000275, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.18964596092700958, "step": 605 }, { @@ -17560,27 +17560,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.068359375, + "completions/clipped_ratio": 0.10546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 1009.521484375, - "completions/mean_terminated_length": 933.3228149414062, - "completions/min_length": 281.0, - "completions/min_terminated_length": 281.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1149.650390625, + "completions/mean_terminated_length": 1043.7314453125, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, "epoch": 0.2068788939148246, - "grad_norm": 4.671616077423096, - "kl": 0.58447265625, - "learning_rate": 9.690748026046386e-07, - "loss": 0.1619, - "num_tokens": 401074696.0, - "reward": 1.04296875, - "reward_std": 0.2465844303369522, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, + "grad_norm": 0.3430488109588623, + "kl": 0.068115234375, + "learning_rate": 9.692467643235805e-07, + "loss": 0.0927, + "num_tokens": 429317953.0, + "reward": 1.013671875, + "reward_std": 0.23991644382476807, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.94140625, - "rewards/tag_count_reward/std": 0.17128607630729675, + "rewards/tag_count_reward/mean": 0.919921875, + "rewards/tag_count_reward/std": 0.20555779337882996, "step": 606 }, { @@ -17589,27 +17589,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.08203125, + "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1923.0, - "completions/mean_length": 1083.345703125, - "completions/mean_terminated_length": 997.1425170898438, - "completions/min_length": 280.0, - "completions/min_terminated_length": 280.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1251.81640625, + "completions/mean_terminated_length": 1146.1282958984375, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, "epoch": 0.20722027822821543, - "grad_norm": 2.694904327392578, - "kl": 0.59326171875, - "learning_rate": 9.688791952138068e-07, - "loss": 0.1555, - "num_tokens": 401703593.0, - "reward": 1.0068359375, - "reward_std": 0.21862350404262543, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, + "grad_norm": 1.1188578605651855, + "kl": 0.09454345703125, + "learning_rate": 9.690516071970182e-07, + "loss": 0.0884, + "num_tokens": 430033107.0, + "reward": 1.00927734375, + "reward_std": 0.22156324982643127, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9345703125, - "rewards/tag_count_reward/std": 0.18798606097698212, + "rewards/tag_count_reward/mean": 0.91552734375, + "rewards/tag_count_reward/std": 0.19800402224063873, "step": 607 }, { @@ -17618,27 +17618,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.08203125, + "completions/clipped_ratio": 0.119140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 1075.267578125, - "completions/mean_terminated_length": 988.342529296875, - "completions/min_length": 233.0, - "completions/min_terminated_length": 233.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1188.490234375, + "completions/mean_terminated_length": 1072.2371826171875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, "epoch": 0.2075616625416062, - "grad_norm": 2.220994710922241, - "kl": 0.68994140625, - "learning_rate": 9.686829933005709e-07, - "loss": 0.1197, - "num_tokens": 402333810.0, - "reward": 1.02587890625, - "reward_std": 0.21877221763134003, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, + "grad_norm": 1.0040582418441772, + "kl": 0.07025146484375, + "learning_rate": 9.688558548519946e-07, + "loss": 0.1061, + "num_tokens": 430721294.0, + "reward": 1.0009765625, + "reward_std": 0.24179868400096893, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.18222182989120483, + "rewards/tag_count_reward/mean": 0.9052734375, + "rewards/tag_count_reward/std": 0.21629218757152557, "step": 608 }, { @@ -17647,27 +17647,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.08984375, + "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1978.0, - "completions/mean_length": 1006.33984375, - "completions/mean_terminated_length": 903.5150146484375, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1205.611328125, + "completions/mean_terminated_length": 1067.765869140625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, "epoch": 0.20790304685499703, - "grad_norm": 1.1816500425338745, - "kl": 0.6005859375, - "learning_rate": 9.684861971434043e-07, - "loss": 0.1771, - "num_tokens": 402925808.0, - "reward": 0.99365234375, - "reward_std": 0.27654939889907837, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, + "grad_norm": 0.3633801341056824, + "kl": 0.103759765625, + "learning_rate": 9.686595075665552e-07, + "loss": 0.0856, + "num_tokens": 431415319.0, + "reward": 1.015625, + "reward_std": 0.27655965089797974, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.21019071340560913, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.21316158771514893, "step": 609 }, { @@ -17676,27 +17676,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.091796875, + "completions/clipped_ratio": 0.1328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 1075.279296875, - "completions/mean_terminated_length": 976.9613037109375, - "completions/min_length": 224.0, - "completions/min_terminated_length": 224.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1242.548828125, + "completions/mean_terminated_length": 1119.1915283203125, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, "epoch": 0.2082444311683878, - "grad_norm": 1.2841227054595947, - "kl": 0.580078125, - "learning_rate": 9.682888070216231e-07, - "loss": 0.1514, - "num_tokens": 403563439.0, - "reward": 1.0458984375, - "reward_std": 0.30254921317100525, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310423493385315, + "grad_norm": 0.6648967266082764, + "kl": 0.08544921875, + "learning_rate": 9.684625656195908e-07, + "loss": 0.0911, + "num_tokens": 432138592.0, + "reward": 1.0888671875, + "reward_std": 0.33590537309646606, + "rewards/accuracy_reward/mean": 0.177734375, + "rewards/accuracy_reward/std": 0.3826628625392914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.20354676246643066, + "rewards/tag_count_reward/mean": 0.9111328125, + "rewards/tag_count_reward/std": 0.20904938876628876, "step": 610 }, { @@ -17705,27 +17705,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.09765625, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1889.0, - "completions/mean_length": 1040.328125, - "completions/mean_terminated_length": 931.272705078125, - "completions/min_length": 288.0, - "completions/min_terminated_length": 288.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1166.6875, + "completions/mean_terminated_length": 1092.0, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, "epoch": 0.20858581548177862, - "grad_norm": 2.6428682804107666, - "kl": 0.8291015625, - "learning_rate": 9.680908232153865e-07, - "loss": 0.2066, - "num_tokens": 404175207.0, - "reward": 1.033203125, - "reward_std": 0.2689477801322937, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, + "grad_norm": 0.6680558323860168, + "kl": 0.090087890625, + "learning_rate": 9.682650292908362e-07, + "loss": 0.0837, + "num_tokens": 432815056.0, + "reward": 1.095703125, + "reward_std": 0.25818902254104614, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.2037649303674698, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.1720430850982666, "step": 611 }, { @@ -17734,27 +17734,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.109375, + "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1054.154296875, - "completions/mean_terminated_length": 932.1030883789062, - "completions/min_length": 207.0, - "completions/min_terminated_length": 207.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1179.91796875, + "completions/mean_terminated_length": 1090.1163330078125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.2089271997951694, - "grad_norm": 3.411529064178467, - "kl": 0.8994140625, - "learning_rate": 9.67892246005696e-07, - "loss": 0.2368, - "num_tokens": 404799126.0, - "reward": 0.93115234375, - "reward_std": 0.2242891639471054, - "rewards/accuracy_reward/mean": 0.01953125, - "rewards/accuracy_reward/std": 0.1385180652141571, + "grad_norm": 0.5763041973114014, + "kl": 0.1025390625, + "learning_rate": 9.680668988608708e-07, + "loss": 0.0878, + "num_tokens": 433503366.0, + "reward": 0.98828125, + "reward_std": 0.18409407138824463, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91162109375, - "rewards/tag_count_reward/std": 0.22201864421367645, + "rewards/tag_count_reward/mean": 0.939453125, + "rewards/tag_count_reward/std": 0.16771192848682404, "step": 612 }, { @@ -17763,27 +17763,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.076171875, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 1026.0078125, - "completions/mean_terminated_length": 941.7420654296875, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1177.51953125, + "completions/mean_terminated_length": 1109.713623046875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, "epoch": 0.20926858410856022, - "grad_norm": 2.8635215759277344, - "kl": 0.857421875, - "learning_rate": 9.67693075674396e-07, - "loss": 0.1983, - "num_tokens": 405396170.0, - "reward": 1.04638671875, - "reward_std": 0.26304829120635986, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, + "grad_norm": 0.5027387738227844, + "kl": 0.144287109375, + "learning_rate": 9.678681746111186e-07, + "loss": 0.0387, + "num_tokens": 434177984.0, + "reward": 1.072265625, + "reward_std": 0.26588761806488037, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.93701171875, - "rewards/tag_count_reward/std": 0.18323110044002533, + "rewards/tag_count_reward/mean": 0.947265625, + "rewards/tag_count_reward/std": 0.16299647092819214, "step": 613 }, { @@ -17792,27 +17792,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.06640625, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, - "completions/mean_length": 983.9609375, - "completions/mean_terminated_length": 908.276123046875, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/mean_length": 1122.59765625, + "completions/mean_terminated_length": 1044.1737060546875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, "epoch": 0.209609968421951, - "grad_norm": 1.214871883392334, - "kl": 0.53515625, - "learning_rate": 9.674933125041722e-07, - "loss": 0.1449, - "num_tokens": 405978086.0, - "reward": 1.05126953125, - "reward_std": 0.2484574317932129, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, + "grad_norm": 0.4440973699092865, + "kl": 0.1669921875, + "learning_rate": 9.676688568238456e-07, + "loss": 0.0751, + "num_tokens": 434830882.0, + "reward": 1.0751953125, + "reward_std": 0.22314193844795227, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.17949029803276062, + "rewards/tag_count_reward/mean": 0.9423828125, + "rewards/tag_count_reward/std": 0.16655419766902924, "step": 614 }, { @@ -17821,27 +17821,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.08984375, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1961.0, - "completions/mean_length": 1032.369140625, - "completions/mean_terminated_length": 932.1137084960938, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1151.62109375, + "completions/mean_terminated_length": 1079.7593994140625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.20995135273534182, - "grad_norm": 5.269324779510498, - "kl": 0.71533203125, - "learning_rate": 9.672929567785517e-07, - "loss": 0.2549, - "num_tokens": 406588067.0, - "reward": 1.02783203125, - "reward_std": 0.2760624885559082, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, + "grad_norm": 1.4469910860061646, + "kl": 0.19482421875, + "learning_rate": 9.67468945782162e-07, + "loss": 0.117, + "num_tokens": 435501920.0, + "reward": 1.0712890625, + "reward_std": 0.22600241005420685, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.19585449993610382, + "rewards/tag_count_reward/mean": 0.9482421875, + "rewards/tag_count_reward/std": 0.14922119677066803, "step": 615 }, { @@ -17850,27 +17850,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.095703125, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1986.0, - "completions/mean_length": 978.466796875, - "completions/mean_terminated_length": 865.2764282226562, - "completions/min_length": 219.0, - "completions/min_terminated_length": 219.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1113.05859375, + "completions/mean_terminated_length": 1035.9703369140625, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, "epoch": 0.2102927370487326, - "grad_norm": 5.354698181152344, - "kl": 0.953125, - "learning_rate": 9.67092008781903e-07, - "loss": 0.2332, - "num_tokens": 407172866.0, - "reward": 0.99609375, - "reward_std": 0.24358966946601868, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, + "grad_norm": 51.226444244384766, + "kl": 1.251953125, + "learning_rate": 9.672684417700203e-07, + "loss": 0.1569, + "num_tokens": 436155630.0, + "reward": 1.013671875, + "reward_std": 0.17954176664352417, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.20074127614498138, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.1519906222820282, "step": 616 }, { @@ -17879,27 +17879,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.123046875, + "completions/clipped_ratio": 0.1015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 1043.744140625, - "completions/mean_terminated_length": 902.835205078125, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1200.74609375, + "completions/mean_terminated_length": 1104.969482421875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, "epoch": 0.21063412136212342, - "grad_norm": 2.5083742141723633, - "kl": 1.2353515625, - "learning_rate": 9.668904687994351e-07, - "loss": 0.2444, - "num_tokens": 407790127.0, - "reward": 0.96826171875, - "reward_std": 0.27520516514778137, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, + "grad_norm": 0.9761913418769836, + "kl": 0.230224609375, + "learning_rate": 9.67067345072215e-07, + "loss": 0.1219, + "num_tokens": 436853276.0, + "reward": 1.0224609375, + "reward_std": 0.2553250193595886, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.89794921875, - "rewards/tag_count_reward/std": 0.22330179810523987, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.17727059125900269, "step": 617 }, { @@ -17908,27 +17908,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.1171875, + "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1990.0, - "completions/mean_length": 1018.123046875, - "completions/mean_terminated_length": 881.4136962890625, - "completions/min_length": 233.0, - "completions/min_terminated_length": 233.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1192.8671875, + "completions/mean_terminated_length": 1112.4700927734375, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, "epoch": 0.2109755056755142, - "grad_norm": 2.2218728065490723, - "kl": 1.0400390625, - "learning_rate": 9.66688337117197e-07, - "loss": 0.2165, - "num_tokens": 408380542.0, - "reward": 1.08837890625, - "reward_std": 0.2701031565666199, - "rewards/accuracy_reward/mean": 0.17578125, - "rewards/accuracy_reward/std": 0.3810062110424042, + "grad_norm": 3.313305616378784, + "kl": 0.2674560546875, + "learning_rate": 9.668656559743827e-07, + "loss": 0.0622, + "num_tokens": 437533160.0, + "reward": 1.1640625, + "reward_std": 0.22554805874824524, + "rewards/accuracy_reward/mean": 0.220703125, + "rewards/accuracy_reward/std": 0.4151262938976288, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.20284785330295563, + "rewards/tag_count_reward/mean": 0.943359375, + "rewards/tag_count_reward/std": 0.1547197550535202, "step": 618 }, { @@ -17937,27 +17937,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 1019.171875, - "completions/mean_terminated_length": 872.1964721679688, - "completions/min_length": 279.0, - "completions/min_terminated_length": 279.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1152.23828125, + "completions/mean_terminated_length": 1092.5208740234375, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, "epoch": 0.21131688998890502, - "grad_norm": 4.6177239418029785, - "kl": 1.50390625, - "learning_rate": 9.664856140220778e-07, - "loss": 0.3002, - "num_tokens": 408980726.0, - "reward": 0.96923828125, - "reward_std": 0.25260913372039795, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, + "grad_norm": 0.35532379150390625, + "kl": 0.153564453125, + "learning_rate": 9.666633747630017e-07, + "loss": 0.0594, + "num_tokens": 438201474.0, + "reward": 1.06103515625, + "reward_std": 0.17333631217479706, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88916015625, - "rewards/tag_count_reward/std": 0.22293777763843536, + "rewards/tag_count_reward/mean": 0.95556640625, + "rewards/tag_count_reward/std": 0.1411210000514984, "step": 619 }, { @@ -17966,27 +17966,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.134765625, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1983.0, - "completions/mean_length": 944.771484375, - "completions/mean_terminated_length": 772.9368286132812, - "completions/min_length": 120.0, - "completions/min_terminated_length": 120.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1061.30078125, + "completions/mean_terminated_length": 999.8880004882812, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.2116582743022958, - "grad_norm": 3.555220127105713, - "kl": 1.1787109375, - "learning_rate": 9.662822998018056e-07, - "loss": 0.295, - "num_tokens": 409541921.0, - "reward": 0.9599609375, - "reward_std": 0.29147058725357056, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, + "grad_norm": 13.381671905517578, + "kl": 0.259033203125, + "learning_rate": 9.66460501725391e-07, + "loss": 0.0589, + "num_tokens": 438822332.0, + "reward": 1.072265625, + "reward_std": 0.20817849040031433, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8876953125, - "rewards/tag_count_reward/std": 0.23478132486343384, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1379096806049347, "step": 620 }, { @@ -17995,27 +17995,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.177734375, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 1059.083984375, - "completions/mean_terminated_length": 845.3278198242188, - "completions/min_length": 211.0, - "completions/min_terminated_length": 211.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1129.638671875, + "completions/mean_terminated_length": 1066.3695068359375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.21199965861568662, - "grad_norm": 3.064396619796753, - "kl": 1.318359375, - "learning_rate": 9.66078394744948e-07, - "loss": 0.339, - "num_tokens": 410162044.0, - "reward": 0.9443359375, - "reward_std": 0.30136334896087646, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, + "grad_norm": 12.081338882446289, + "kl": 0.30029296875, + "learning_rate": 9.662570371497098e-07, + "loss": 0.0854, + "num_tokens": 439478579.0, + "reward": 1.0810546875, + "reward_std": 0.2389516532421112, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8662109375, - "rewards/tag_count_reward/std": 0.24364906549453735, + "rewards/tag_count_reward/mean": 0.9501953125, + "rewards/tag_count_reward/std": 0.15311495959758759, "step": 621 }, { @@ -18024,27 +18024,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.095703125, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1915.0, - "completions/mean_length": 923.7109375, - "completions/mean_terminated_length": 804.7257080078125, - "completions/min_length": 266.0, - "completions/min_terminated_length": 266.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1067.845703125, + "completions/mean_terminated_length": 1002.5021362304688, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, "epoch": 0.2123410429290774, - "grad_norm": 2.5860583782196045, - "kl": 1.013671875, - "learning_rate": 9.65873899140911e-07, - "loss": 0.2859, - "num_tokens": 410713128.0, - "reward": 1.04443359375, - "reward_std": 0.29433169960975647, - "rewards/accuracy_reward/mean": 0.138671875, - "rewards/accuracy_reward/std": 0.34594178199768066, + "grad_norm": 0.3467945456504822, + "kl": 0.1341552734375, + "learning_rate": 9.660529813249586e-07, + "loss": 0.0564, + "num_tokens": 440103460.0, + "reward": 1.10693359375, + "reward_std": 0.2673289179801941, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90576171875, - "rewards/tag_count_reward/std": 0.20519430935382843, + "rewards/tag_count_reward/mean": 0.94873046875, + "rewards/tag_count_reward/std": 0.14649643003940582, "step": 622 }, { @@ -18053,27 +18053,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.099609375, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 875.408203125, - "completions/mean_terminated_length": 745.6854858398438, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 979.154296875, + "completions/mean_terminated_length": 940.20849609375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.21268242724246822, - "grad_norm": 2.3693439960479736, - "kl": 1.138671875, - "learning_rate": 9.656688132799382e-07, - "loss": 0.2828, - "num_tokens": 411235097.0, - "reward": 1.05859375, - "reward_std": 0.32291650772094727, - "rewards/accuracy_reward/mean": 0.158203125, - "rewards/accuracy_reward/std": 0.36528825759887695, - "rewards/format_reward/mean": 0.0, - "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.900390625, - "rewards/tag_count_reward/std": 0.21804559230804443, + "grad_norm": 8.4327974319458, + "kl": 0.3653564453125, + "learning_rate": 9.65848334540977e-07, + "loss": 0.0686, + "num_tokens": 440678547.0, + "reward": 1.20458984375, + "reward_std": 0.25466054677963257, + "rewards/accuracy_reward/mean": 0.23828125, + "rewards/accuracy_reward/std": 0.42644867300987244, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.96630859375, + "rewards/tag_count_reward/std": 0.12883129715919495, "step": 623 }, { @@ -18082,27 +18082,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.20703125, + "completions/clipped_ratio": 0.087890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 1061.150390625, - "completions/mean_terminated_length": 803.5, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1137.7578125, + "completions/mean_terminated_length": 1050.047119140625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, "epoch": 0.213023811555859, - "grad_norm": 2.54702091217041, - "kl": 1.3125, - "learning_rate": 9.65463137453112e-07, - "loss": 0.3377, - "num_tokens": 411867830.0, - "reward": 0.92919921875, - "reward_std": 0.3174704909324646, - "rewards/accuracy_reward/mean": 0.0927419364452362, - "rewards/accuracy_reward/std": 0.2903633117675781, + "grad_norm": 0.7812462449073792, + "kl": 0.1260986328125, + "learning_rate": 9.656430970884437e-07, + "loss": 0.0806, + "num_tokens": 441350503.0, + "reward": 1.0771484375, + "reward_std": 0.2545987069606781, + "rewards/accuracy_reward/mean": 0.14717741310596466, + "rewards/accuracy_reward/std": 0.354640394449234, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.83935546875, - "rewards/tag_count_reward/std": 0.2597397565841675, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.1680619716644287, "step": 624 }, { @@ -18111,27 +18111,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.20703125, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1057.361328125, - "completions/mean_terminated_length": 798.7216796875, - "completions/min_length": 192.0, - "completions/min_terminated_length": 192.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1145.984375, + "completions/mean_terminated_length": 1075.7220458984375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, "epoch": 0.21336519586924982, - "grad_norm": 11.175084114074707, - "kl": 1.546875, - "learning_rate": 9.652568719523516e-07, - "loss": 0.352, - "num_tokens": 412487919.0, - "reward": 0.921875, - "reward_std": 0.27674612402915955, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, + "grad_norm": 1.5636364221572876, + "kl": 0.119140625, + "learning_rate": 9.65437269258877e-07, + "loss": 0.1033, + "num_tokens": 442015967.0, + "reward": 1.05419921875, + "reward_std": 0.21494579315185547, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.841796875, - "rewards/tag_count_reward/std": 0.25722646713256836, + "rewards/tag_count_reward/mean": 0.94482421875, + "rewards/tag_count_reward/std": 0.16331200301647186, "step": 625 }, { @@ -18140,27 +18140,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.169921875, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 965.78515625, - "completions/mean_terminated_length": 744.2493896484375, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1055.650390625, + "completions/mean_terminated_length": 1008.9754638671875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, "epoch": 0.2137065801826406, - "grad_norm": 2.918363094329834, - "kl": 1.091796875, - "learning_rate": 9.650500170704127e-07, - "loss": 0.3223, - "num_tokens": 413062593.0, - "reward": 1.00048828125, - "reward_std": 0.3051578402519226, - "rewards/accuracy_reward/mean": 0.12890625, - "rewards/accuracy_reward/std": 0.33542385697364807, + "grad_norm": 11.236275672912598, + "kl": 0.2906494140625, + "learning_rate": 9.652308513446339e-07, + "loss": 0.0817, + "num_tokens": 442636652.0, + "reward": 1.1494140625, + "reward_std": 0.1985960602760315, + "rewards/accuracy_reward/mean": 0.201171875, + "rewards/accuracy_reward/std": 0.4012683033943176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87158203125, - "rewards/tag_count_reward/std": 0.23195762932300568, + "rewards/tag_count_reward/mean": 0.9482421875, + "rewards/tag_count_reward/std": 0.16180500388145447, "step": 626 }, { @@ -18169,27 +18169,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.21484375, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1790.0, - "completions/mean_length": 1026.69140625, - "completions/mean_terminated_length": 747.2288208007812, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1134.619140625, + "completions/mean_terminated_length": 1067.5994873046875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, "epoch": 0.21404796449603142, - "grad_norm": 1.993016004562378, - "kl": 1.376953125, - "learning_rate": 9.648425731008884e-07, - "loss": 0.3599, - "num_tokens": 413666819.0, - "reward": 0.90771484375, - "reward_std": 0.2868208587169647, - "rewards/accuracy_reward/mean": 0.06451612710952759, - "rewards/accuracy_reward/std": 0.2459181249141693, + "grad_norm": 0.43855372071266174, + "kl": 0.11474609375, + "learning_rate": 9.650238436389088e-07, + "loss": 0.0637, + "num_tokens": 443296137.0, + "reward": 1.0791015625, + "reward_std": 0.247659370303154, + "rewards/accuracy_reward/mean": 0.13104838132858276, + "rewards/accuracy_reward/std": 0.3377939760684967, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.84521484375, - "rewards/tag_count_reward/std": 0.263744592666626, + "rewards/tag_count_reward/mean": 0.9521484375, + "rewards/tag_count_reward/std": 0.15213829278945923, "step": 627 }, { @@ -18198,27 +18198,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.212890625, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1949.0, - "completions/mean_length": 996.869140625, - "completions/mean_terminated_length": 712.5682373046875, - "completions/min_length": 182.0, - "completions/min_terminated_length": 182.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1024.248046875, + "completions/mean_terminated_length": 978.2836303710938, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.2143893488094222, - "grad_norm": 9.06557846069336, - "kl": 2.041015625, - "learning_rate": 9.646345403382073e-07, - "loss": 0.4058, - "num_tokens": 414256832.0, - "reward": 0.8994140625, - "reward_std": 0.2883273959159851, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, + "grad_norm": 0.6465330123901367, + "kl": 0.122314453125, + "learning_rate": 9.648162464357344e-07, + "loss": 0.0772, + "num_tokens": 443900168.0, + "reward": 1.1220703125, + "reward_std": 0.22566094994544983, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8505859375, - "rewards/tag_count_reward/std": 0.25678586959838867, + "rewards/tag_count_reward/mean": 0.9619140625, + "rewards/tag_count_reward/std": 0.13906539976596832, "step": 628 }, { @@ -18227,27 +18227,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.208984375, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1975.0, - "completions/mean_length": 988.466796875, - "completions/mean_terminated_length": 708.540771484375, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1025.5625, + "completions/mean_terminated_length": 952.8367919921875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.21473073312281302, - "grad_norm": 9.490395545959473, - "kl": 1.888671875, - "learning_rate": 9.644259190776339e-07, - "loss": 0.4092, - "num_tokens": 414846015.0, - "reward": 0.8935546875, - "reward_std": 0.317560076713562, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, + "grad_norm": 5.626789569854736, + "kl": 0.2998046875, + "learning_rate": 9.646080600299802e-07, + "loss": 0.0728, + "num_tokens": 444508344.0, + "reward": 1.04345703125, + "reward_std": 0.23375311493873596, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8369140625, - "rewards/tag_count_reward/std": 0.26033368706703186, + "rewards/tag_count_reward/mean": 0.95556640625, + "rewards/tag_count_reward/std": 0.1511640101671219, "step": 629 }, { @@ -18256,27 +18256,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.27734375, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1917.0, - "completions/mean_length": 1080.869140625, - "completions/mean_terminated_length": 709.7000122070312, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1063.392578125, + "completions/mean_terminated_length": 1012.8480834960938, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, "epoch": 0.2150721174362038, - "grad_norm": 5.034303665161133, - "kl": 1.919921875, - "learning_rate": 9.642167096152678e-07, - "loss": 0.4097, - "num_tokens": 415472380.0, - "reward": 0.86279296875, - "reward_std": 0.3311331570148468, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, + "grad_norm": 0.5234067440032959, + "kl": 0.1243896484375, + "learning_rate": 9.643992847173535e-07, + "loss": 0.0793, + "num_tokens": 445125761.0, + "reward": 1.0537109375, + "reward_std": 0.2313232421875, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.81005859375, - "rewards/tag_count_reward/std": 0.27167558670043945, + "rewards/tag_count_reward/mean": 0.9580078125, + "rewards/tag_count_reward/std": 0.13881781697273254, "step": 630 }, { @@ -18285,27 +18285,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.275390625, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1978.0, - "completions/mean_length": 1102.31640625, - "completions/mean_terminated_length": 742.9056396484375, - "completions/min_length": 228.0, - "completions/min_terminated_length": 228.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1152.306640625, + "completions/mean_terminated_length": 1080.5, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.21541350174959462, - "grad_norm": 3.3463125228881836, - "kl": 1.89453125, - "learning_rate": 9.640069122480437e-07, - "loss": 0.419, - "num_tokens": 416111182.0, - "reward": 0.849609375, - "reward_std": 0.3056153655052185, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, + "grad_norm": 2.6268069744110107, + "kl": 0.244873046875, + "learning_rate": 9.641899207943971e-07, + "loss": 0.0832, + "num_tokens": 445790158.0, + "reward": 1.029296875, + "reward_std": 0.23583336174488068, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.814453125, - "rewards/tag_count_reward/std": 0.27581244707107544, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.16007427871227264, "step": 631 }, { @@ -18314,27 +18314,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.255859375, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1996.0, - "completions/mean_length": 1081.791015625, - "completions/mean_terminated_length": 749.5774536132812, - "completions/min_length": 217.0, - "completions/min_terminated_length": 217.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1067.2109375, + "completions/mean_terminated_length": 1008.322998046875, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, "epoch": 0.2157548860629854, - "grad_norm": 2.0118744373321533, - "kl": 1.451171875, - "learning_rate": 9.637965272737305e-07, - "loss": 0.3259, - "num_tokens": 416742259.0, - "reward": 0.95458984375, - "reward_std": 0.3619065284729004, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310423493385315, + "grad_norm": 2.4828763008117676, + "kl": 0.1898193359375, + "learning_rate": 9.639799685584907e-07, + "loss": 0.0496, + "num_tokens": 446413770.0, + "reward": 1.20263671875, + "reward_std": 0.31869786977767944, + "rewards/accuracy_reward/mean": 0.251953125, + "rewards/accuracy_reward/std": 0.43455907702445984, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.82958984375, - "rewards/tag_count_reward/std": 0.27249738574028015, + "rewards/tag_count_reward/mean": 0.95068359375, + "rewards/tag_count_reward/std": 0.15207155048847198, "step": 632 }, { @@ -18343,27 +18343,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.275390625, + "completions/clipped_ratio": 0.091796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 1102.6484375, - "completions/mean_terminated_length": 743.3638916015625, - "completions/min_length": 224.0, - "completions/min_terminated_length": 224.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1151.234375, + "completions/mean_terminated_length": 1060.5936279296875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, "epoch": 0.21609627037637621, - "grad_norm": 3.584123373031616, - "kl": 1.57421875, - "learning_rate": 9.635855549909314e-07, - "loss": 0.4054, - "num_tokens": 417380319.0, - "reward": 0.84814453125, - "reward_std": 0.30755841732025146, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, + "grad_norm": 0.9391236901283264, + "kl": 0.1937255859375, + "learning_rate": 9.63769428307849e-07, + "loss": 0.0704, + "num_tokens": 447076706.0, + "reward": 1.0625, + "reward_std": 0.23874926567077637, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.80712890625, - "rewards/tag_count_reward/std": 0.2745445668697357, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.182897686958313, "step": 633 }, { @@ -18372,27 +18372,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.208984375, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, - "completions/mean_length": 981.1484375, - "completions/mean_terminated_length": 699.2889404296875, - "completions/min_length": 168.0, - "completions/min_terminated_length": 168.0, + "completions/mean_length": 1042.548828125, + "completions/mean_terminated_length": 966.50634765625, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, "epoch": 0.216437654689767, - "grad_norm": 3.4458224773406982, - "kl": 1.330078125, - "learning_rate": 9.63373995699083e-07, - "loss": 0.3272, - "num_tokens": 417955531.0, - "reward": 0.92431640625, - "reward_std": 0.29383552074432373, - "rewards/accuracy_reward/mean": 0.08266129344701767, - "rewards/accuracy_reward/std": 0.2756476104259491, + "grad_norm": 1.5962361097335815, + "kl": 0.2001953125, + "learning_rate": 9.63558300341522e-07, + "loss": 0.0938, + "num_tokens": 447683355.0, + "reward": 1.0751953125, + "reward_std": 0.25834929943084717, + "rewards/accuracy_reward/mean": 0.13306452333927155, + "rewards/accuracy_reward/std": 0.3399873673915863, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.84423828125, - "rewards/tag_count_reward/std": 0.26036444306373596, + "rewards/tag_count_reward/mean": 0.9462890625, + "rewards/tag_count_reward/std": 0.16858422756195068, "step": 634 }, { @@ -18401,27 +18401,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.25, + "completions/clipped_ratio": 0.11328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 1077.49609375, - "completions/mean_terminated_length": 753.9948120117188, - "completions/min_length": 209.0, - "completions/min_terminated_length": 209.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1205.935546875, + "completions/mean_terminated_length": 1098.3590087890625, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, "epoch": 0.21677903900315781, - "grad_norm": 1.7347800731658936, - "kl": 1.4765625, - "learning_rate": 9.631618496984546e-07, - "loss": 0.3759, - "num_tokens": 418586489.0, - "reward": 0.84375, - "reward_std": 0.2716478109359741, - "rewards/accuracy_reward/mean": 0.021484375, - "rewards/accuracy_reward/std": 0.14513419568538666, + "grad_norm": 5.905625820159912, + "kl": 0.414794921875, + "learning_rate": 9.63346584959395e-07, + "loss": 0.0961, + "num_tokens": 448380074.0, + "reward": 0.97216796875, + "reward_std": 0.2261180281639099, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.822265625, - "rewards/tag_count_reward/std": 0.2702690064907074, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.19671128690242767, "step": 635 }, { @@ -18430,27 +18430,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.27734375, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1909.0, - "completions/mean_length": 1107.6015625, - "completions/mean_terminated_length": 746.69189453125, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1121.923828125, + "completions/mean_terminated_length": 1051.884521484375, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, "epoch": 0.2171204233165486, - "grad_norm": 2.8225364685058594, - "kl": 1.798828125, - "learning_rate": 9.62949117290149e-07, - "loss": 0.4617, - "num_tokens": 419229645.0, - "reward": 0.8427734375, - "reward_std": 0.3390008211135864, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, + "grad_norm": 1.1541615724563599, + "kl": 0.133544921875, + "learning_rate": 9.63134282462187e-07, + "loss": 0.1177, + "num_tokens": 449030563.0, + "reward": 1.04052734375, + "reward_std": 0.20850953459739685, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7919921875, - "rewards/tag_count_reward/std": 0.289990097284317, + "rewards/tag_count_reward/mean": 0.94287109375, + "rewards/tag_count_reward/std": 0.16927079856395721, "step": 636 }, { @@ -18459,27 +18459,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.34375, + "completions/clipped_ratio": 0.095703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1986.0, - "completions/mean_length": 1199.63671875, - "completions/mean_terminated_length": 755.2559814453125, - "completions/min_length": 218.0, - "completions/min_terminated_length": 218.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1158.65625, + "completions/mean_terminated_length": 1064.53564453125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.2174618076299394, - "grad_norm": 3.7464311122894287, - "kl": 1.8359375, - "learning_rate": 9.627357987761007e-07, - "loss": 0.4191, - "num_tokens": 419923651.0, - "reward": 0.826171875, - "reward_std": 0.32486432790756226, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, + "grad_norm": 0.7348608374595642, + "kl": 0.143798828125, + "learning_rate": 9.629213931514513e-07, + "loss": 0.0863, + "num_tokens": 449703587.0, + "reward": 1.06689453125, + "reward_std": 0.23084154725074768, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.78125, - "rewards/tag_count_reward/std": 0.2905053198337555, + "rewards/tag_count_reward/mean": 0.94189453125, + "rewards/tag_count_reward/std": 0.16675129532814026, "step": 637 }, { @@ -18488,27 +18488,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.37109375, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1828.0, - "completions/mean_length": 1204.205078125, - "completions/mean_terminated_length": 706.3136596679688, - "completions/min_length": 195.0, - "completions/min_terminated_length": 195.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1109.015625, + "completions/mean_terminated_length": 1027.2781982421875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, "epoch": 0.2178031919433302, - "grad_norm": 4.913912296295166, - "kl": 2.041015625, - "learning_rate": 9.625218944590763e-07, - "loss": 0.4591, - "num_tokens": 420608700.0, - "reward": 0.8134765625, - "reward_std": 0.3354353904724121, - "rewards/accuracy_reward/mean": 0.05443548411130905, - "rewards/accuracy_reward/std": 0.227104052901268, + "grad_norm": 5.815976142883301, + "kl": 0.2303466796875, + "learning_rate": 9.627079173295747e-07, + "loss": 0.0975, + "num_tokens": 450339899.0, + "reward": 1.08642578125, + "reward_std": 0.24913567304611206, + "rewards/accuracy_reward/mean": 0.15120968222618103, + "rewards/accuracy_reward/std": 0.35861483216285706, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7607421875, - "rewards/tag_count_reward/std": 0.30549731850624084, + "rewards/tag_count_reward/mean": 0.93994140625, + "rewards/tag_count_reward/std": 0.1718478947877884, "step": 638 }, { @@ -18517,27 +18517,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.32421875, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 1095.244140625, - "completions/mean_terminated_length": 638.1416015625, - "completions/min_length": 168.0, - "completions/min_terminated_length": 168.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1069.744140625, + "completions/mean_terminated_length": 984.588134765625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.218144576256721, - "grad_norm": 3.983170509338379, - "kl": 2.02734375, - "learning_rate": 9.623074046426744e-07, - "loss": 0.4582, - "num_tokens": 421242665.0, - "reward": 0.8583984375, - "reward_std": 0.3076876401901245, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, + "grad_norm": 0.7231947183609009, + "kl": 0.1585693359375, + "learning_rate": 9.62493855299777e-07, + "loss": 0.0733, + "num_tokens": 450960808.0, + "reward": 1.09912109375, + "reward_std": 0.26701319217681885, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7880859375, - "rewards/tag_count_reward/std": 0.2905299961566925, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.1757296919822693, "step": 639 }, { @@ -18546,27 +18546,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.337890625, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1799.0, - "completions/mean_length": 1141.359375, - "completions/mean_terminated_length": 678.678466796875, - "completions/min_length": 218.0, - "completions/min_terminated_length": 218.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1105.728515625, + "completions/mean_terminated_length": 1036.589111328125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.2184859605701118, - "grad_norm": 2.0060925483703613, - "kl": 2.048828125, - "learning_rate": 9.620923296313234e-07, - "loss": 0.4543, - "num_tokens": 421902817.0, - "reward": 0.84765625, - "reward_std": 0.35959500074386597, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, + "grad_norm": 1.2328861951828003, + "kl": 0.254150390625, + "learning_rate": 9.622792073661107e-07, + "loss": 0.1195, + "num_tokens": 451602717.0, + "reward": 1.0859375, + "reward_std": 0.24592015147209167, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7578125, - "rewards/tag_count_reward/std": 0.304784893989563, + "rewards/tag_count_reward/mean": 0.939453125, + "rewards/tag_count_reward/std": 0.16916421055793762, "step": 640 }, { @@ -18575,27 +18575,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.375, + "completions/clipped_ratio": 0.1015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1801.0, - "completions/mean_length": 1226.77734375, - "completions/mean_terminated_length": 734.0437622070312, - "completions/min_length": 219.0, - "completions/min_terminated_length": 219.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1137.111328125, + "completions/mean_terminated_length": 1034.1412353515625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, "epoch": 0.2188273448835026, - "grad_norm": 2.325049877166748, - "kl": 2.037109375, - "learning_rate": 9.618766697302835e-07, - "loss": 0.4852, - "num_tokens": 422605567.0, - "reward": 0.78125, - "reward_std": 0.3310891389846802, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, + "grad_norm": 1.3244597911834717, + "kl": 0.20556640625, + "learning_rate": 9.620639738334602e-07, + "loss": 0.1133, + "num_tokens": 452259558.0, + "reward": 1.060546875, + "reward_std": 0.21626397967338562, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.734375, - "rewards/tag_count_reward/std": 0.3104507029056549, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.17896848917007446, "step": 641 }, { @@ -18604,27 +18604,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.26171875, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 1047.201171875, - "completions/mean_terminated_length": 692.4205932617188, - "completions/min_length": 206.0, - "completions/min_terminated_length": 206.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1081.0625, + "completions/mean_terminated_length": 1001.3361206054688, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, "epoch": 0.2191687291968934, - "grad_norm": 4.015636920928955, - "kl": 1.654296875, - "learning_rate": 9.616604252456437e-07, - "loss": 0.4472, - "num_tokens": 423219302.0, - "reward": 0.84375, - "reward_std": 0.32642629742622375, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, + "grad_norm": 4.508296489715576, + "kl": 0.299072265625, + "learning_rate": 9.618481550075423e-07, + "loss": 0.0903, + "num_tokens": 452890630.0, + "reward": 1.041015625, + "reward_std": 0.23068498075008392, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.791015625, - "rewards/tag_count_reward/std": 0.29473087191581726, + "rewards/tag_count_reward/mean": 0.935546875, + "rewards/tag_count_reward/std": 0.18636520206928253, "step": 642 }, { @@ -18633,27 +18633,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.271484375, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 1144.35546875, - "completions/mean_terminated_length": 807.6085815429688, - "completions/min_length": 216.0, - "completions/min_terminated_length": 216.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1166.376953125, + "completions/mean_terminated_length": 1087.5936279296875, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, "epoch": 0.2195101135102842, - "grad_norm": 3.6396005153656006, - "kl": 1.47265625, - "learning_rate": 9.614435964843245e-07, - "loss": 0.3916, - "num_tokens": 423884940.0, - "reward": 0.81640625, - "reward_std": 0.3465976119041443, - "rewards/accuracy_reward/mean": 0.05443548411130905, - "rewards/accuracy_reward/std": 0.227104052901268, + "grad_norm": 14.663076400756836, + "kl": 0.561279296875, + "learning_rate": 9.616317511949047e-07, + "loss": 0.1207, + "num_tokens": 453567543.0, + "reward": 1.08056640625, + "reward_std": 0.24798694252967834, + "rewards/accuracy_reward/mean": 0.14919355511665344, + "rewards/accuracy_reward/std": 0.3566388487815857, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.763671875, - "rewards/tag_count_reward/std": 0.3081708550453186, + "rewards/tag_count_reward/mean": 0.93603515625, + "rewards/tag_count_reward/std": 0.17538133263587952, "step": 643 }, { @@ -18662,27 +18662,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.2890625, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1986.0, - "completions/mean_length": 1122.044921875, - "completions/mean_terminated_length": 745.5577392578125, - "completions/min_length": 169.0, - "completions/min_terminated_length": 169.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1087.89453125, + "completions/mean_terminated_length": 1021.74951171875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.219851497823675, - "grad_norm": 4.020759105682373, - "kl": 1.462890625, - "learning_rate": 9.612261837540738e-07, - "loss": 0.4117, - "num_tokens": 424533427.0, - "reward": 0.83837890625, - "reward_std": 0.32647258043289185, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, + "grad_norm": 1.0262993574142456, + "kl": 0.205810546875, + "learning_rate": 9.61414762702926e-07, + "loss": 0.076, + "num_tokens": 454198545.0, + "reward": 1.08056640625, + "reward_std": 0.24146193265914917, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.78369140625, - "rewards/tag_count_reward/std": 0.29997169971466064, + "rewards/tag_count_reward/mean": 0.95361328125, + "rewards/tag_count_reward/std": 0.15771618485450745, "step": 644 }, { @@ -18691,27 +18691,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.412109375, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1463.0, - "completions/mean_length": 1249.642578125, - "completions/mean_terminated_length": 689.9966430664062, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1069.38671875, + "completions/mean_terminated_length": 995.3740234375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, "epoch": 0.2201928821370658, - "grad_norm": 3.8461523056030273, - "kl": 1.94140625, - "learning_rate": 9.610081873634696e-07, - "loss": 0.4567, - "num_tokens": 425248124.0, - "reward": 0.73583984375, - "reward_std": 0.3610597252845764, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, + "grad_norm": 3.216106414794922, + "kl": 0.30322265625, + "learning_rate": 9.611971898398155e-07, + "loss": 0.0824, + "num_tokens": 454820951.0, + "reward": 1.072265625, + "reward_std": 0.21444615721702576, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.69677734375, - "rewards/tag_count_reward/std": 0.3291517496109009, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.14162877202033997, "step": 645 }, { @@ -18720,27 +18720,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.37109375, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1224.62890625, - "completions/mean_terminated_length": 738.788818359375, - "completions/min_length": 173.0, - "completions/min_terminated_length": 173.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1022.4375, + "completions/mean_terminated_length": 967.5719604492188, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, "epoch": 0.2205342664504566, - "grad_norm": 3.524672746658325, - "kl": 1.8984375, - "learning_rate": 9.607896076219181e-07, - "loss": 0.4526, - "num_tokens": 425963998.0, - "reward": 0.75634765625, - "reward_std": 0.336562842130661, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, + "grad_norm": 1.9107329845428467, + "kl": 0.269287109375, + "learning_rate": 9.609790329146124e-07, + "loss": 0.1166, + "num_tokens": 455433303.0, + "reward": 1.03564453125, + "reward_std": 0.22553111612796783, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.71923828125, - "rewards/tag_count_reward/std": 0.3142194449901581, + "rewards/tag_count_reward/mean": 0.94970703125, + "rewards/tag_count_reward/std": 0.1549411565065384, "step": 646 }, { @@ -18749,27 +18749,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.392578125, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1799.0, - "completions/mean_length": 1248.44921875, - "completions/mean_terminated_length": 731.69775390625, - "completions/min_length": 224.0, - "completions/min_terminated_length": 224.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1099.373046875, + "completions/mean_terminated_length": 1050.6756591796875, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, "epoch": 0.2208756507638474, - "grad_norm": 2.5784406661987305, - "kl": 1.724609375, - "learning_rate": 9.605704448396529e-07, - "loss": 0.4351, - "num_tokens": 426684164.0, - "reward": 0.76806640625, - "reward_std": 0.3576868176460266, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, + "grad_norm": 1.1532723903656006, + "kl": 0.283203125, + "learning_rate": 9.607602922371855e-07, + "loss": 0.11, + "num_tokens": 456077142.0, + "reward": 1.0625, + "reward_std": 0.23354697227478027, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.69775390625, - "rewards/tag_count_reward/std": 0.31873929500579834, + "rewards/tag_count_reward/mean": 0.951171875, + "rewards/tag_count_reward/std": 0.1604439616203308, "step": 647 }, { @@ -18778,27 +18778,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.326171875, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1911.0, - "completions/mean_length": 1166.34765625, - "completions/mean_terminated_length": 739.5768432617188, - "completions/min_length": 92.0, - "completions/min_terminated_length": 92.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1034.953125, + "completions/mean_terminated_length": 998.04052734375, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, "epoch": 0.2212170350772382, - "grad_norm": 4.20598840713501, - "kl": 1.849609375, - "learning_rate": 9.603506993277354e-07, - "loss": 0.4638, - "num_tokens": 427353270.0, - "reward": 0.79931640625, - "reward_std": 0.3777962327003479, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, + "grad_norm": 1.4442238807678223, + "kl": 0.2216796875, + "learning_rate": 9.605409681182328e-07, + "loss": 0.0786, + "num_tokens": 456678974.0, + "reward": 1.14013671875, + "reward_std": 0.23208163678646088, + "rewards/accuracy_reward/mean": 0.181640625, + "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.73291015625, - "rewards/tag_count_reward/std": 0.31796491146087646, + "rewards/tag_count_reward/mean": 0.95849609375, + "rewards/tag_count_reward/std": 0.13940425217151642, "step": 648 }, { @@ -18807,27 +18807,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.3125, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1823.0, - "completions/mean_length": 1143.755859375, - "completions/mean_terminated_length": 732.73583984375, - "completions/min_length": 218.0, - "completions/min_terminated_length": 218.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1062.14453125, + "completions/mean_terminated_length": 1009.4032592773438, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, "epoch": 0.221558419390629, - "grad_norm": 3.2356879711151123, - "kl": 1.5703125, - "learning_rate": 9.601303713980545e-07, - "loss": 0.4345, - "num_tokens": 428012521.0, - "reward": 0.79541015625, - "reward_std": 0.3835356831550598, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, + "grad_norm": 6.689726829528809, + "kl": 0.454345703125, + "learning_rate": 9.60321060869281e-07, + "loss": 0.0621, + "num_tokens": 457296440.0, + "reward": 1.0439453125, + "reward_std": 0.220662921667099, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.74462890625, - "rewards/tag_count_reward/std": 0.31334546208381653, + "rewards/tag_count_reward/mean": 0.9560546875, + "rewards/tag_count_reward/std": 0.14762458205223083, "step": 649 }, { @@ -18836,27 +18836,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.302734375, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 1155.529296875, - "completions/mean_terminated_length": 768.0420532226562, - "completions/min_length": 192.0, - "completions/min_terminated_length": 192.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 1094.9453125, + "completions/mean_terminated_length": 1027.15478515625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, "epoch": 0.2218998037040198, - "grad_norm": 4.234147071838379, - "kl": 1.3828125, - "learning_rate": 9.599094613633255e-07, - "loss": 0.3902, - "num_tokens": 428684888.0, - "reward": 0.8271484375, - "reward_std": 0.38090699911117554, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, + "grad_norm": 1.7962830066680908, + "kl": 0.34765625, + "learning_rate": 9.601005708026851e-07, + "loss": 0.1216, + "num_tokens": 457937788.0, + "reward": 1.09130859375, + "reward_std": 0.29480671882629395, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7431640625, - "rewards/tag_count_reward/std": 0.30958616733551025, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.17362910509109497, "step": 650 }, { @@ -18865,27 +18865,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.330078125, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 1201.9921875, - "completions/mean_terminated_length": 785.154541015625, - "completions/min_length": 267.0, - "completions/min_terminated_length": 267.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1061.111328125, + "completions/mean_terminated_length": 1020.9938354492188, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, "epoch": 0.2222411880174106, - "grad_norm": 3.7783749103546143, - "kl": 1.68359375, - "learning_rate": 9.596879695370894e-07, - "loss": 0.4232, - "num_tokens": 429373508.0, - "reward": 0.78271484375, - "reward_std": 0.38518527150154114, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, + "grad_norm": 10.762516021728516, + "kl": 0.40771484375, + "learning_rate": 9.59879498231628e-07, + "loss": 0.0941, + "num_tokens": 458554277.0, + "reward": 1.0830078125, + "reward_std": 0.24051667749881744, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.72021484375, - "rewards/tag_count_reward/std": 0.32427331805229187, + "rewards/tag_count_reward/mean": 0.9521484375, + "rewards/tag_count_reward/std": 0.15453127026557922, "step": 651 }, { @@ -18894,27 +18894,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.2890625, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1840.0, - "completions/mean_length": 1102.439453125, - "completions/mean_terminated_length": 717.9807739257812, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 990.31640625, + "completions/mean_terminated_length": 940.5684814453125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.2225825723308014, - "grad_norm": 5.931146144866943, - "kl": 1.54296875, - "learning_rate": 9.594658962337134e-07, - "loss": 0.4444, - "num_tokens": 430018293.0, - "reward": 0.81689453125, - "reward_std": 0.3459562659263611, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, + "grad_norm": 3.1867833137512207, + "kl": 0.333251953125, + "learning_rate": 9.596578434701198e-07, + "loss": 0.1013, + "num_tokens": 459141655.0, + "reward": 1.0732421875, + "reward_std": 0.25161483883857727, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.76416015625, - "rewards/tag_count_reward/std": 0.3099297285079956, + "rewards/tag_count_reward/mean": 0.9580078125, + "rewards/tag_count_reward/std": 0.1490161269903183, "step": 652 }, { @@ -18923,27 +18923,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.31640625, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 1171.966796875, - "completions/mean_terminated_length": 766.4885864257812, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1085.6171875, + "completions/mean_terminated_length": 1012.8319702148438, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, "epoch": 0.2229239566441922, - "grad_norm": 3.5709598064422607, - "kl": 1.8046875, - "learning_rate": 9.592432417683903e-07, - "loss": 0.4129, - "num_tokens": 430697892.0, - "reward": 0.7685546875, - "reward_std": 0.36317336559295654, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, + "grad_norm": 26.43825340270996, + "kl": 1.060546875, + "learning_rate": 9.594356068329975e-07, + "loss": 0.1706, + "num_tokens": 459777043.0, + "reward": 1.083984375, + "reward_std": 0.28108423948287964, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7041015625, - "rewards/tag_count_reward/std": 0.3233308494091034, + "rewards/tag_count_reward/mean": 0.943359375, + "rewards/tag_count_reward/std": 0.1782301664352417, "step": 653 }, { @@ -18952,27 +18952,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.322265625, + "completions/clipped_ratio": 0.095703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1935.0, - "completions/mean_length": 1180.69140625, - "completions/mean_terminated_length": 768.2824096679688, - "completions/min_length": 215.0, - "completions/min_terminated_length": 215.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1111.92578125, + "completions/mean_terminated_length": 1012.8595581054688, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, "epoch": 0.223265340957583, - "grad_norm": 5.544654846191406, - "kl": 1.74609375, - "learning_rate": 9.59020006457137e-07, - "loss": 0.4349, - "num_tokens": 431384006.0, - "reward": 0.771484375, - "reward_std": 0.36828142404556274, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, + "grad_norm": 29024.78125, + "kl": 550.1171875, + "learning_rate": 9.592127886359247e-07, + "loss": 22.1737, + "num_tokens": 460427949.0, + "reward": 1.017578125, + "reward_std": 0.247486412525177, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.720703125, - "rewards/tag_count_reward/std": 0.3233736753463745, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.21882425248622894, "step": 654 }, { @@ -18981,27 +18981,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.275390625, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1902.0, - "completions/mean_length": 1096.169921875, - "completions/mean_terminated_length": 734.4231567382812, - "completions/min_length": 174.0, - "completions/min_terminated_length": 174.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1067.748046875, + "completions/mean_terminated_length": 998.0230102539062, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.2236067252709738, - "grad_norm": 6.116422176361084, - "kl": 2.353515625, - "learning_rate": 9.587961906167952e-07, - "loss": 0.4265, - "num_tokens": 432017821.0, - "reward": 0.8349609375, - "reward_std": 0.3506982922554016, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.0, - "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7607421875, - "rewards/tag_count_reward/std": 0.3090795576572418, + "grad_norm": 15.009438514709473, + "kl": 0.81396484375, + "learning_rate": 9.589893891953914e-07, + "loss": 0.1353, + "num_tokens": 461047212.0, + "reward": 1.0439453125, + "reward_std": 0.21697312593460083, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.18668025732040405, "step": 655 }, { @@ -19010,27 +19010,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.24609375, + "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1882.0, - "completions/mean_length": 1036.408203125, - "completions/mean_terminated_length": 706.199462890625, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 983.181640625, + "completions/mean_terminated_length": 883.070556640625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, "epoch": 0.2239481095843646, - "grad_norm": 6.848345756530762, - "kl": 2.25390625, - "learning_rate": 9.585717945650307e-07, - "loss": 0.4598, - "num_tokens": 432624126.0, - "reward": 0.89990234375, - "reward_std": 0.4077647030353546, - "rewards/accuracy_reward/mean": 0.140625, - "rewards/accuracy_reward/std": 0.3479743003845215, + "grad_norm": 151.59271240234375, + "kl": 2.9228515625, + "learning_rate": 9.587654088287128e-07, + "loss": 0.2174, + "num_tokens": 461626265.0, + "reward": 1.1240234375, + "reward_std": 0.24472007155418396, + "rewards/accuracy_reward/mean": 0.197265625, + "rewards/accuracy_reward/std": 0.3983237147331238, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.75927734375, - "rewards/tag_count_reward/std": 0.31208041310310364, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.2075078934431076, "step": 656 }, { @@ -19039,27 +19039,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.31640625, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1953.0, - "completions/mean_length": 1143.375, - "completions/mean_terminated_length": 724.662841796875, - "completions/min_length": 196.0, - "completions/min_terminated_length": 196.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1095.560546875, + "completions/mean_terminated_length": 1021.3704833984375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, "epoch": 0.2242894938977554, - "grad_norm": 9.063117980957031, - "kl": 2.5, - "learning_rate": 9.583468186203326e-07, - "loss": 0.5099, - "num_tokens": 433281934.0, - "reward": 0.77880859375, - "reward_std": 0.3714064955711365, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, + "grad_norm": 2.1108622550964355, + "kl": 0.46875, + "learning_rate": 9.585408478540289e-07, + "loss": 0.1222, + "num_tokens": 462259592.0, + "reward": 1.07421875, + "reward_std": 0.2752722501754761, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.72216796875, - "rewards/tag_count_reward/std": 0.32217732071876526, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.2015109360218048, "step": 657 }, { @@ -19068,27 +19068,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.322265625, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1942.0, - "completions/mean_length": 1146.294921875, - "completions/mean_terminated_length": 717.5302124023438, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1022.576171875, + "completions/mean_terminated_length": 978.718994140625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, "epoch": 0.2246308782111462, - "grad_norm": 16.551767349243164, - "kl": 3.5390625, - "learning_rate": 9.581212631020132e-07, - "loss": 0.5075, - "num_tokens": 433937845.0, - "reward": 0.765625, - "reward_std": 0.35465750098228455, - "rewards/accuracy_reward/mean": 0.038306452333927155, - "rewards/accuracy_reward/std": 0.19212883710861206, - "rewards/format_reward/mean": 0.0, - "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.728515625, - "rewards/tag_count_reward/std": 0.3186587393283844, + "grad_norm": 1.516474962234497, + "kl": 0.1995849609375, + "learning_rate": 9.58315706590305e-07, + "loss": 0.0797, + "num_tokens": 462852159.0, + "reward": 1.09228515625, + "reward_std": 0.236845463514328, + "rewards/accuracy_reward/mean": 0.13306452333927155, + "rewards/accuracy_reward/std": 0.3399873673915863, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.96142578125, + "rewards/tag_count_reward/std": 0.13671186566352844, "step": 658 }, { @@ -19097,27 +19097,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.3046875, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1755.0, - "completions/mean_length": 1129.478515625, - "completions/mean_terminated_length": 726.9803466796875, - "completions/min_length": 193.0, - "completions/min_terminated_length": 193.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1110.794921875, + "completions/mean_terminated_length": 1033.52001953125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.224972262524537, - "grad_norm": 10.934162139892578, - "kl": 3.0546875, - "learning_rate": 9.578951283302072e-07, - "loss": 0.5226, - "num_tokens": 434597802.0, - "reward": 0.779296875, - "reward_std": 0.3694903254508972, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, + "grad_norm": 3.092843532562256, + "kl": 0.4921875, + "learning_rate": 9.580899853573308e-07, + "loss": 0.1366, + "num_tokens": 463502550.0, + "reward": 1.0341796875, + "reward_std": 0.23125649988651276, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.73046875, - "rewards/tag_count_reward/std": 0.3218393921852112, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.19123151898384094, "step": 659 }, { @@ -19126,27 +19126,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.263671875, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1107.08203125, - "completions/mean_terminated_length": 770.1484985351562, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1087.748046875, + "completions/mean_terminated_length": 1042.582763671875, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, "epoch": 0.2253136468379278, - "grad_norm": 10.84688949584961, - "kl": 3.5546875, - "learning_rate": 9.576684146258715e-07, - "loss": 0.4756, - "num_tokens": 435238276.0, - "reward": 0.771484375, - "reward_std": 0.3382296562194824, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, + "grad_norm": 1.9711774587631226, + "kl": 0.244873046875, + "learning_rate": 9.57863684475719e-07, + "loss": 0.079, + "num_tokens": 464133125.0, + "reward": 1.07763671875, + "reward_std": 0.26416558027267456, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.732421875, - "rewards/tag_count_reward/std": 0.3154278099536896, + "rewards/tag_count_reward/mean": 0.95458984375, + "rewards/tag_count_reward/std": 0.14924278855323792, "step": 660 }, { @@ -19155,27 +19155,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.205078125, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 1014.0, - "completions/mean_terminated_length": 747.2432250976562, - "completions/min_length": 229.0, - "completions/min_terminated_length": 229.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1025.240234375, + "completions/mean_terminated_length": 994.3721923828125, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, "epoch": 0.2256550311513186, - "grad_norm": 9.455982208251953, - "kl": 3.6484375, - "learning_rate": 9.574411223107849e-07, - "loss": 0.4937, - "num_tokens": 435835508.0, - "reward": 0.82861328125, - "reward_std": 0.37730008363723755, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.75830078125, - "rewards/tag_count_reward/std": 0.30856090784072876, + "grad_norm": 3.0381991863250732, + "kl": 0.377685546875, + "learning_rate": 9.576368042669063e-07, + "loss": 0.0684, + "num_tokens": 464736112.0, + "reward": 1.0947265625, + "reward_std": 0.2287023365497589, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9599609375, + "rewards/tag_count_reward/std": 0.13673719763755798, "step": 661 }, { @@ -19184,27 +19184,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.17578125, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1966.0, - "completions/mean_length": 984.41796875, - "completions/mean_terminated_length": 757.5877075195312, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 1065.873046875, + "completions/mean_terminated_length": 996.0146484375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.2259964154647094, - "grad_norm": 55.217586517333984, - "kl": 5.5546875, - "learning_rate": 9.572132517075472e-07, - "loss": 0.5353, - "num_tokens": 436417178.0, - "reward": 0.8818359375, - "reward_std": 0.38336285948753357, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, + "grad_norm": 4.670502662658691, + "kl": 0.5634765625, + "learning_rate": 9.574093450531518e-07, + "loss": 0.1339, + "num_tokens": 465359487.0, + "reward": 1.12451171875, + "reward_std": 0.2823469042778015, + "rewards/accuracy_reward/mean": 0.185546875, + "rewards/accuracy_reward/std": 0.38912075757980347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7783203125, - "rewards/tag_count_reward/std": 0.29787030816078186, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.1798572540283203, "step": 662 }, { @@ -19213,27 +19213,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.154296875, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 998.3984375, - "completions/mean_terminated_length": 806.9006958007812, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1157.30859375, + "completions/mean_terminated_length": 1079.7750244140625, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, "epoch": 0.2263377997781002, - "grad_norm": 8.337776184082031, - "kl": 4.05859375, - "learning_rate": 9.56984803139579e-07, - "loss": 0.4669, - "num_tokens": 437012886.0, - "reward": 0.8271484375, - "reward_std": 0.3475971519947052, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, + "grad_norm": 18.447429656982422, + "kl": 1.0283203125, + "learning_rate": 9.571813071575375e-07, + "loss": 0.156, + "num_tokens": 466036557.0, + "reward": 1.00048828125, + "reward_std": 0.24942567944526672, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7841796875, - "rewards/tag_count_reward/std": 0.29185569286346436, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.18932512402534485, "step": 663 }, { @@ -19242,27 +19242,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.138671875, + "completions/clipped_ratio": 0.09765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 954.8515625, - "completions/mean_terminated_length": 778.857177734375, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1140.634765625, + "completions/mean_terminated_length": 1042.43505859375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.226679184091491, - "grad_norm": 17.617353439331055, - "kl": 4.45703125, - "learning_rate": 9.567557769311213e-07, - "loss": 0.4034, - "num_tokens": 437574442.0, - "reward": 0.8857421875, - "reward_std": 0.33483001589775085, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, + "grad_norm": 86.0719985961914, + "kl": 1.96484375, + "learning_rate": 9.56952690903967e-07, + "loss": 0.1846, + "num_tokens": 466693234.0, + "reward": 1.0400390625, + "reward_std": 0.2606220841407776, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8037109375, - "rewards/tag_count_reward/std": 0.27854791283607483, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.20621450245380402, "step": 664 }, { @@ -19271,27 +19271,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.076171875, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 884.009765625, - "completions/mean_terminated_length": 788.0359497070312, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1072.736328125, + "completions/mean_terminated_length": 1033.0914306640625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, "epoch": 0.2270205684048818, - "grad_norm": 14.468287467956543, - "kl": 4.265625, - "learning_rate": 9.565261734072346e-07, - "loss": 0.4193, - "num_tokens": 438108351.0, - "reward": 0.8193359375, - "reward_std": 0.32358604669570923, - "rewards/accuracy_reward/mean": 0.025390625, - "rewards/accuracy_reward/std": 0.15746226906776428, + "grad_norm": 2.3356974124908447, + "kl": 0.32177734375, + "learning_rate": 9.567234966171651e-07, + "loss": 0.0914, + "num_tokens": 467323771.0, + "reward": 1.02001953125, + "reward_std": 0.20084954798221588, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7939453125, - "rewards/tag_count_reward/std": 0.2884306311607361, + "rewards/tag_count_reward/mean": 0.95751953125, + "rewards/tag_count_reward/std": 0.1425827592611313, "step": 665 }, { @@ -19300,27 +19300,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.078125, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 837.427734375, - "completions/mean_terminated_length": 734.8368530273438, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1050.623046875, + "completions/mean_terminated_length": 997.265380859375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, "epoch": 0.2273619527182726, - "grad_norm": 12.217328071594238, - "kl": 3.38671875, - "learning_rate": 9.562959928937999e-07, - "loss": 0.3894, - "num_tokens": 438616250.0, - "reward": 0.8662109375, - "reward_std": 0.3066549599170685, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, + "grad_norm": 12.463715553283691, + "kl": 0.55224609375, + "learning_rate": 9.564937246226787e-07, + "loss": 0.0961, + "num_tokens": 467940826.0, + "reward": 1.06591796875, + "reward_std": 0.21576757729053497, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8232421875, - "rewards/tag_count_reward/std": 0.27226102352142334, + "rewards/tag_count_reward/mean": 0.95263671875, + "rewards/tag_count_reward/std": 0.16426876187324524, "step": 666 }, { @@ -19329,27 +19329,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 795.8984375, - "completions/mean_terminated_length": 723.4627685546875, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1061.53515625, + "completions/mean_terminated_length": 989.1530151367188, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.2277033370316634, - "grad_norm": 8.62721061706543, - "kl": 3.6328125, - "learning_rate": 9.560652357175157e-07, - "loss": 0.3485, - "num_tokens": 439102582.0, - "reward": 0.93359375, - "reward_std": 0.3151872754096985, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, + "grad_norm": 2.1825854778289795, + "kl": 0.297119140625, + "learning_rate": 9.562633752468744e-07, + "loss": 0.1005, + "num_tokens": 468563164.0, + "reward": 1.06640625, + "reward_std": 0.20834612846374512, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.841796875, - "rewards/tag_count_reward/std": 0.2600637674331665, + "rewards/tag_count_reward/mean": 0.951171875, + "rewards/tag_count_reward/std": 0.15501591563224792, "step": 667 }, { @@ -19358,27 +19358,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1999.0, - "completions/mean_length": 775.390625, - "completions/mean_terminated_length": 723.6585083007812, - "completions/min_length": 70.0, - "completions/min_terminated_length": 70.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 995.908203125, + "completions/mean_terminated_length": 948.67138671875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, "epoch": 0.22804472134505419, - "grad_norm": 12.339641571044922, - "kl": 3.54296875, - "learning_rate": 9.558339022058995e-07, - "loss": 0.3654, - "num_tokens": 439572894.0, - "reward": 0.921875, - "reward_std": 0.3119737505912781, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, + "grad_norm": 1.2705602645874023, + "kl": 0.23828125, + "learning_rate": 9.560324488169387e-07, + "loss": 0.0693, + "num_tokens": 469146381.0, + "reward": 1.1171875, + "reward_std": 0.2328583300113678, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.83984375, - "rewards/tag_count_reward/std": 0.25744181871414185, + "rewards/tag_count_reward/mean": 0.958984375, + "rewards/tag_count_reward/std": 0.14513419568538666, "step": 668 }, { @@ -19387,27 +19387,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1943.0, - "completions/mean_length": 812.435546875, - "completions/mean_terminated_length": 762.2093505859375, - "completions/min_length": 75.0, - "completions/min_terminated_length": 75.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 963.025390625, + "completions/mean_terminated_length": 921.2109375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.228386105658445, - "grad_norm": 15.873409271240234, - "kl": 3.3125, - "learning_rate": 9.556019926872874e-07, - "loss": 0.3952, - "num_tokens": 440066541.0, - "reward": 0.8818359375, - "reward_std": 0.3047889173030853, - "rewards/accuracy_reward/mean": 0.060483869165182114, - "rewards/accuracy_reward/std": 0.2386218160390854, + "grad_norm": 2.1494147777557373, + "kl": 0.20263671875, + "learning_rate": 9.558009456608786e-07, + "loss": 0.0783, + "num_tokens": 469717130.0, + "reward": 1.05810546875, + "reward_std": 0.19948862493038177, + "rewards/accuracy_reward/mean": 0.10080645233392715, + "rewards/accuracy_reward/std": 0.30137622356414795, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8232421875, - "rewards/tag_count_reward/std": 0.27181142568588257, + "rewards/tag_count_reward/mean": 0.96044921875, + "rewards/tag_count_reward/std": 0.1382133513689041, "step": 669 }, { @@ -19416,27 +19416,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 736.40625, - "completions/mean_terminated_length": 707.6087646484375, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1002.859375, + "completions/mean_terminated_length": 960.3739624023438, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, "epoch": 0.22872748997183578, - "grad_norm": 5.676901340484619, - "kl": 3.48046875, - "learning_rate": 9.553695074908321e-07, - "loss": 0.2961, - "num_tokens": 440525437.0, - "reward": 0.94775390625, - "reward_std": 0.3289104402065277, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.84619140625, - "rewards/tag_count_reward/std": 0.25632917881011963, + "grad_norm": 0.9131292104721069, + "kl": 0.195068359375, + "learning_rate": 9.555688661075198e-07, + "loss": 0.0574, + "num_tokens": 470312450.0, + "reward": 1.146484375, + "reward_std": 0.2447153776884079, + "rewards/accuracy_reward/mean": 0.177734375, + "rewards/accuracy_reward/std": 0.3826628625392914, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.1308559775352478, "step": 670 }, { @@ -19445,27 +19445,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 722.0, - "completions/mean_terminated_length": 681.9798583984375, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 874.861328125, + "completions/mean_terminated_length": 841.8814697265625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.2290688742852266, - "grad_norm": 5.196404933929443, - "kl": 4.921875, - "learning_rate": 9.55136446946504e-07, - "loss": 0.4483, - "num_tokens": 440963101.0, - "reward": 0.91064453125, - "reward_std": 0.3597055971622467, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.81298828125, - "rewards/tag_count_reward/std": 0.27857664227485657, + "grad_norm": 1.7777296304702759, + "kl": 0.1478271484375, + "learning_rate": 9.553362104865063e-07, + "loss": 0.0553, + "num_tokens": 470828379.0, + "reward": 1.16552734375, + "reward_std": 0.24790939688682556, + "rewards/accuracy_reward/mean": 0.197265625, + "rewards/accuracy_reward/std": 0.3983237147331238, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.96826171875, + "rewards/tag_count_reward/std": 0.1274217665195465, "step": 671 }, { @@ -19474,27 +19474,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 827.990234375, - "completions/mean_terminated_length": 783.5364379882812, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1031.810546875, + "completions/mean_terminated_length": 986.1856689453125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, "epoch": 0.22941025859861738, - "grad_norm": 23.459131240844727, - "kl": 7.03125, - "learning_rate": 9.549028113850903e-07, - "loss": 0.4704, - "num_tokens": 441464904.0, - "reward": 0.775390625, - "reward_std": 0.33348000049591064, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, + "grad_norm": 1.654274821281433, + "kl": 0.289306640625, + "learning_rate": 9.551029791283014e-07, + "loss": 0.0718, + "num_tokens": 471434538.0, + "reward": 1.0263671875, + "reward_std": 0.19223755598068237, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.73828125, - "rewards/tag_count_reward/std": 0.3066602051258087, + "rewards/tag_count_reward/mean": 0.9580078125, + "rewards/tag_count_reward/std": 0.14315544068813324, "step": 672 }, { @@ -19503,27 +19503,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 785.119140625, - "completions/mean_terminated_length": 747.0040283203125, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1033.25390625, + "completions/mean_terminated_length": 996.2793579101562, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, "epoch": 0.2297516429120082, - "grad_norm": 21.86775779724121, - "kl": 7.875, - "learning_rate": 9.54668601138193e-07, - "loss": 0.5296, - "num_tokens": 441944709.0, - "reward": 0.828125, - "reward_std": 0.33433833718299866, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, + "grad_norm": 14.117053985595703, + "kl": 0.67431640625, + "learning_rate": 9.548691723641851e-07, + "loss": 0.0823, + "num_tokens": 472041388.0, + "reward": 1.1025390625, + "reward_std": 0.23748698830604553, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7734375, - "rewards/tag_count_reward/std": 0.28228431940078735, + "rewards/tag_count_reward/mean": 0.9619140625, + "rewards/tag_count_reward/std": 0.1408134400844574, "step": 673 }, { @@ -19532,27 +19532,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 785.490234375, - "completions/mean_terminated_length": 744.7640991210938, - "completions/min_length": 42.0, - "completions/min_terminated_length": 42.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 989.890625, + "completions/mean_terminated_length": 957.9556884765625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, "epoch": 0.23009302722539898, - "grad_norm": 13.236749649047852, - "kl": 7.3671875, - "learning_rate": 9.544338165382318e-07, - "loss": 0.5494, - "num_tokens": 442423712.0, - "reward": 0.7939453125, - "reward_std": 0.33076488971710205, - "rewards/accuracy_reward/mean": 0.06653226166963577, - "rewards/accuracy_reward/std": 0.24946178495883942, + "grad_norm": 1.6093616485595703, + "kl": 0.27734375, + "learning_rate": 9.546347905262556e-07, + "loss": 0.0896, + "num_tokens": 472625044.0, + "reward": 1.064453125, + "reward_std": 0.20218700170516968, + "rewards/accuracy_reward/mean": 0.10685484111309052, + "rewards/accuracy_reward/std": 0.3092404901981354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7294921875, - "rewards/tag_count_reward/std": 0.29851120710372925, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1361243724822998, "step": 674 }, { @@ -19561,27 +19561,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 800.494140625, - "completions/mean_terminated_length": 765.4236450195312, - "completions/min_length": 90.0, - "completions/min_terminated_length": 90.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1101.912109375, + "completions/mean_terminated_length": 1028.216796875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, "epoch": 0.2304344115387898, - "grad_norm": 10.939888954162598, - "kl": 6.359375, - "learning_rate": 9.541984579184399e-07, - "loss": 0.4581, - "num_tokens": 442914301.0, - "reward": 0.7734375, - "reward_std": 0.35030168294906616, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, + "grad_norm": 2.2166543006896973, + "kl": 0.5693359375, + "learning_rate": 9.543998339474272e-07, + "loss": 0.1435, + "num_tokens": 473269959.0, + "reward": 1.0322265625, + "reward_std": 0.2642326056957245, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.724609375, - "rewards/tag_count_reward/std": 0.3034211993217468, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.19616466760635376, "step": 675 }, { @@ -19590,27 +19590,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1940.0, - "completions/mean_length": 797.484375, - "completions/mean_terminated_length": 759.742431640625, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1068.87890625, + "completions/mean_terminated_length": 1010.0911254882812, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.23077579585218058, - "grad_norm": 4.543842792510986, - "kl": 4.94140625, - "learning_rate": 9.539625256128658e-07, - "loss": 0.3953, - "num_tokens": 443401701.0, - "reward": 0.80859375, - "reward_std": 0.3620191514492035, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.7265625, - "rewards/tag_count_reward/std": 0.29747387766838074, + "grad_norm": 9.61108112335205, + "kl": 0.51708984375, + "learning_rate": 9.541643029614309e-07, + "loss": 0.0981, + "num_tokens": 473896313.0, + "reward": 1.07958984375, + "reward_std": 0.24111506342887878, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95068359375, + "rewards/tag_count_reward/std": 0.15207155048847198, "step": 676 }, { @@ -19619,27 +19619,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1967.0, - "completions/mean_length": 751.935546875, - "completions/mean_terminated_length": 726.1175537109375, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1002.8359375, + "completions/mean_terminated_length": 966.9414672851562, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, "epoch": 0.2311171801655714, - "grad_norm": 5.480804920196533, - "kl": 4.234375, - "learning_rate": 9.537260199563723e-07, - "loss": 0.3736, - "num_tokens": 443870196.0, - "reward": 0.8095703125, - "reward_std": 0.3241175413131714, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, + "grad_norm": 3.052957534790039, + "kl": 0.48681640625, + "learning_rate": 9.539281979028132e-07, + "loss": 0.0844, + "num_tokens": 474493269.0, + "reward": 1.041015625, + "reward_std": 0.17344260215759277, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7763671875, - "rewards/tag_count_reward/std": 0.28289130330085754, + "rewards/tag_count_reward/mean": 0.962890625, + "rewards/tag_count_reward/std": 0.12740769982337952, "step": 677 }, { @@ -19648,27 +19648,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 738.060546875, - "completions/mean_terminated_length": 717.2678833007812, - "completions/min_length": 46.0, - "completions/min_terminated_length": 46.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 999.80078125, + "completions/mean_terminated_length": 950.4989624023438, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, "epoch": 0.23145856447896218, - "grad_norm": 8.061043739318848, - "kl": 3.640625, - "learning_rate": 9.534889412846361e-07, - "loss": 0.324, - "num_tokens": 444319731.0, - "reward": 0.86083984375, - "reward_std": 0.33779412508010864, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, + "grad_norm": 5.525875568389893, + "kl": 0.7158203125, + "learning_rate": 9.53691519106937e-07, + "loss": 0.1567, + "num_tokens": 475076815.0, + "reward": 1.07666015625, + "reward_std": 0.2343330681324005, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.79833984375, - "rewards/tag_count_reward/std": 0.2788749039173126, + "rewards/tag_count_reward/mean": 0.95556640625, + "rewards/tag_count_reward/std": 0.1535721719264984, "step": 678 }, { @@ -19677,27 +19677,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1977.0, - "completions/mean_length": 767.890625, - "completions/mean_terminated_length": 737.1680297851562, - "completions/min_length": 92.0, - "completions/min_terminated_length": 92.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1035.986328125, + "completions/mean_terminated_length": 972.9979858398438, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, "epoch": 0.231799948792353, - "grad_norm": 10.127677917480469, - "kl": 3.7421875, - "learning_rate": 9.532512899341467e-07, - "loss": 0.3546, - "num_tokens": 444791643.0, - "reward": 0.859375, - "reward_std": 0.3290579319000244, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, + "grad_norm": 5.435208797454834, + "kl": 0.623046875, + "learning_rate": 9.534542669099792e-07, + "loss": 0.1484, + "num_tokens": 475685992.0, + "reward": 1.0361328125, + "reward_std": 0.24256518483161926, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.791015625, - "rewards/tag_count_reward/std": 0.28070247173309326, + "rewards/tag_count_reward/mean": 0.9462890625, + "rewards/tag_count_reward/std": 0.1784525215625763, "step": 679 }, { @@ -19706,27 +19706,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.005859375, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 734.447265625, - "completions/mean_terminated_length": 726.705322265625, - "completions/min_length": 42.0, - "completions/min_terminated_length": 42.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1027.701171875, + "completions/mean_terminated_length": 948.2252197265625, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.23214133310574378, - "grad_norm": 10.456915855407715, - "kl": 3.46875, - "learning_rate": 9.53013066242207e-07, - "loss": 0.3382, - "num_tokens": 445253072.0, - "reward": 0.88427734375, - "reward_std": 0.32650578022003174, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, + "grad_norm": 3.4324328899383545, + "kl": 0.7021484375, + "learning_rate": 9.532164416489314e-07, + "loss": 0.1585, + "num_tokens": 476297567.0, + "reward": 1.03466796875, + "reward_std": 0.24270811676979065, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.81005859375, - "rewards/tag_count_reward/std": 0.2712250053882599, + "rewards/tag_count_reward/mean": 0.92919921875, + "rewards/tag_count_reward/std": 0.19781571626663208, "step": 680 }, { @@ -19735,27 +19735,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1891.0, - "completions/mean_length": 718.912109375, - "completions/mean_terminated_length": 705.8047485351562, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 973.802734375, + "completions/mean_terminated_length": 904.5717163085938, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.2324827174191346, - "grad_norm": 3.8908488750457764, - "kl": 4.5078125, - "learning_rate": 9.527742705469318e-07, - "loss": 0.3677, - "num_tokens": 445695043.0, - "reward": 0.900390625, - "reward_std": 0.36471623182296753, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.798828125, - "rewards/tag_count_reward/std": 0.2785702049732208, + "grad_norm": 10.406401634216309, + "kl": 1.11474609375, + "learning_rate": 9.529780436615992e-07, + "loss": 0.1538, + "num_tokens": 476870042.0, + "reward": 1.10791015625, + "reward_std": 0.2906540632247925, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3776407241821289, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93603515625, + "rewards/tag_count_reward/std": 0.18686124682426453, "step": 681 }, { @@ -19764,27 +19764,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 735.57421875, - "completions/mean_terminated_length": 722.6311645507812, - "completions/min_length": 89.0, - "completions/min_terminated_length": 89.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 993.78515625, + "completions/mean_terminated_length": 937.3867797851562, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, "epoch": 0.23282410173252538, - "grad_norm": 4.040297985076904, - "kl": 5.4765625, - "learning_rate": 9.525349031872481e-07, - "loss": 0.3961, - "num_tokens": 446147449.0, - "reward": 0.80419921875, - "reward_std": 0.3184512257575989, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, + "grad_norm": 4.53196382522583, + "kl": 0.9306640625, + "learning_rate": 9.527390732866016e-07, + "loss": 0.1547, + "num_tokens": 477454652.0, + "reward": 1.083984375, + "reward_std": 0.23493245244026184, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.76904296875, - "rewards/tag_count_reward/std": 0.28022506833076477, + "rewards/tag_count_reward/mean": 0.947265625, + "rewards/tag_count_reward/std": 0.1659708470106125, "step": 682 }, { @@ -19793,27 +19793,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.087890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 717.15625, - "completions/mean_terminated_length": 693.3439331054688, - "completions/min_length": 32.0, - "completions/min_terminated_length": 32.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 1071.419921875, + "completions/mean_terminated_length": 977.31689453125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.2331654860459162, - "grad_norm": 4.727411270141602, - "kl": 5.390625, - "learning_rate": 9.52294964502894e-07, - "loss": 0.3808, - "num_tokens": 446591257.0, - "reward": 0.7978515625, - "reward_std": 0.33729878067970276, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, + "grad_norm": 12.178365707397461, + "kl": 1.2666015625, + "learning_rate": 9.52499530863371e-07, + "loss": 0.1879, + "num_tokens": 478079843.0, + "reward": 1.0439453125, + "reward_std": 0.2553746700286865, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7373046875, - "rewards/tag_count_reward/std": 0.2872621417045593, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.20262455940246582, "step": 683 }, { @@ -19822,27 +19822,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.00390625, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1986.0, - "completions/mean_length": 715.095703125, - "completions/mean_terminated_length": 709.86865234375, - "completions/min_length": 46.0, - "completions/min_terminated_length": 46.0, + "completions/max_terminated_length": 1924.0, + "completions/mean_length": 1027.55078125, + "completions/mean_terminated_length": 968.5164794921875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, "epoch": 0.23350687035930698, - "grad_norm": 3.564438581466675, - "kl": 5.296875, - "learning_rate": 9.520544548344184e-07, - "loss": 0.3726, - "num_tokens": 447031610.0, - "reward": 0.84130859375, - "reward_std": 0.3505712151527405, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, + "grad_norm": 5.280699729919434, + "kl": 0.9677734375, + "learning_rate": 9.522594167321519e-07, + "loss": 0.1295, + "num_tokens": 478680173.0, + "reward": 1.0732421875, + "reward_std": 0.2343463897705078, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.76513671875, - "rewards/tag_count_reward/std": 0.2847912311553955, + "rewards/tag_count_reward/mean": 0.9521484375, + "rewards/tag_count_reward/std": 0.1584392786026001, "step": 684 }, { @@ -19851,27 +19851,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1864.0, - "completions/mean_length": 754.017578125, - "completions/mean_terminated_length": 743.8287353515625, - "completions/min_length": 90.0, - "completions/min_terminated_length": 90.0, + "completions/max_terminated_length": 1907.0, + "completions/mean_length": 1113.02734375, + "completions/mean_terminated_length": 1040.1978759765625, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, "epoch": 0.2338482546726978, - "grad_norm": 9.209820747375488, - "kl": 5.546875, - "learning_rate": 9.51813374523181e-07, - "loss": 0.3173, - "num_tokens": 447496787.0, - "reward": 0.80517578125, - "reward_std": 0.3297889232635498, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, + "grad_norm": 26.820039749145508, + "kl": 1.2861328125, + "learning_rate": 9.520187312340011e-07, + "loss": 0.1656, + "num_tokens": 479329163.0, + "reward": 1.0380859375, + "reward_std": 0.25632888078689575, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.76220703125, - "rewards/tag_count_reward/std": 0.2806067168712616, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.1873546987771988, "step": 685 }, { @@ -19880,27 +19880,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.087890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 728.1796875, - "completions/mean_terminated_length": 712.5296630859375, - "completions/min_length": 95.0, - "completions/min_terminated_length": 95.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1055.44921875, + "completions/mean_terminated_length": 959.8072509765625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.23418963898608858, - "grad_norm": 9.223282814025879, - "kl": 6.0546875, - "learning_rate": 9.515717239113511e-07, - "loss": 0.3745, - "num_tokens": 447945871.0, - "reward": 0.78271484375, - "reward_std": 0.36198490858078003, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.72998046875, - "rewards/tag_count_reward/std": 0.303219199180603, + "grad_norm": 3.7987263202667236, + "kl": 0.96044921875, + "learning_rate": 9.517774747107868e-07, + "loss": 0.1271, + "num_tokens": 479945809.0, + "reward": 1.05322265625, + "reward_std": 0.281509131193161, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.20810279250144958, "step": 686 }, { @@ -19909,27 +19909,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1909.0, - "completions/mean_length": 728.1484375, - "completions/mean_terminated_length": 717.7559204101562, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1050.935546875, + "completions/mean_terminated_length": 982.2442626953125, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, "epoch": 0.2345310232994794, - "grad_norm": 3.5982277393341064, - "kl": 4.69140625, - "learning_rate": 9.513295033419077e-07, - "loss": 0.33, - "num_tokens": 448398059.0, - "reward": 0.81884765625, - "reward_std": 0.3465069532394409, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, + "grad_norm": 5.604643821716309, + "kl": 0.38330078125, + "learning_rate": 9.515356475051884e-07, + "loss": 0.1541, + "num_tokens": 480563264.0, + "reward": 1.07568359375, + "reward_std": 0.28175023198127747, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.76806640625, - "rewards/tag_count_reward/std": 0.29015275835990906, + "rewards/tag_count_reward/mean": 0.94287109375, + "rewards/tag_count_reward/std": 0.17773018777370453, "step": 687 }, { @@ -19938,27 +19938,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 754.5390625, - "completions/mean_terminated_length": 744.3543090820312, - "completions/min_length": 96.0, - "completions/min_terminated_length": 96.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1068.994140625, + "completions/mean_terminated_length": 1018.7371826171875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.23487240761287018, - "grad_norm": 3.5190393924713135, - "kl": 4.75390625, - "learning_rate": 9.510867131586383e-07, - "loss": 0.3383, - "num_tokens": 448873087.0, - "reward": 0.77197265625, - "reward_std": 0.3334955871105194, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.73876953125, - "rewards/tag_count_reward/std": 0.3012463450431824, + "grad_norm": 4.438237190246582, + "kl": 0.294921875, + "learning_rate": 9.512932499606957e-07, + "loss": 0.0731, + "num_tokens": 481199293.0, + "reward": 1.033203125, + "reward_std": 0.2095925211906433, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.951171875, + "rewards/tag_count_reward/std": 0.15101934969425201, "step": 688 }, { @@ -19967,27 +19967,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.005859375, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1997.0, - "completions/mean_length": 736.884765625, - "completions/mean_terminated_length": 729.1572265625, - "completions/min_length": 66.0, - "completions/min_terminated_length": 66.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1045.1640625, + "completions/mean_terminated_length": 987.1487426757812, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, "epoch": 0.235213791926261, - "grad_norm": 4.137765407562256, - "kl": 4.7109375, - "learning_rate": 9.508433537061394e-07, - "loss": 0.3412, - "num_tokens": 449330356.0, - "reward": 0.78125, - "reward_std": 0.31054240465164185, - "rewards/accuracy_reward/mean": 0.02217741869390011, - "rewards/accuracy_reward/std": 0.14740893244743347, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.7578125, - "rewards/tag_count_reward/std": 0.29499658942222595, + "grad_norm": 6.629810333251953, + "kl": 0.36962890625, + "learning_rate": 9.510502824216091e-07, + "loss": 0.1187, + "num_tokens": 481814401.0, + "reward": 0.9970703125, + "reward_std": 0.19589579105377197, + "rewards/accuracy_reward/mean": 0.05443548411130905, + "rewards/accuracy_reward/std": 0.227104052901268, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9443359375, + "rewards/tag_count_reward/std": 0.1743793785572052, "step": 689 }, { @@ -19996,27 +19996,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1898.0, - "completions/mean_length": 778.779296875, - "completions/mean_terminated_length": 740.4728393554688, - "completions/min_length": 67.0, - "completions/min_terminated_length": 67.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 990.037109375, + "completions/mean_terminated_length": 944.7882080078125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, "epoch": 0.23555517623965178, - "grad_norm": 4.3230671882629395, - "kl": 5.7734375, - "learning_rate": 9.505994253298152e-07, - "loss": 0.4177, - "num_tokens": 449808403.0, - "reward": 0.79150390625, - "reward_std": 0.35842466354370117, - "rewards/accuracy_reward/mean": 0.060483869165182114, - "rewards/accuracy_reward/std": 0.2386218160390854, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.73095703125, - "rewards/tag_count_reward/std": 0.30003857612609863, + "grad_norm": 7.432539463043213, + "kl": 0.586669921875, + "learning_rate": 9.50806745233038e-07, + "loss": 0.0801, + "num_tokens": 482400612.0, + "reward": 1.09814453125, + "reward_std": 0.20601269602775574, + "rewards/accuracy_reward/mean": 0.14717741310596466, + "rewards/accuracy_reward/std": 0.354640394449234, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95556640625, + "rewards/tag_count_reward/std": 0.14871685206890106, "step": 690 }, { @@ -20025,27 +20025,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1843.0, - "completions/mean_length": 707.78125, - "completions/mean_terminated_length": 694.5640869140625, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 1058.2734375, + "completions/mean_terminated_length": 990.0877075195312, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.2358965605530426, - "grad_norm": 2.4250447750091553, - "kl": 4.6171875, - "learning_rate": 9.503549283758773e-07, - "loss": 0.3016, - "num_tokens": 450247379.0, - "reward": 0.80224609375, - "reward_std": 0.3462855815887451, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, + "grad_norm": 4.2346110343933105, + "kl": 0.42919921875, + "learning_rate": 9.505626387409013e-07, + "loss": 0.0859, + "num_tokens": 483019040.0, + "reward": 1.0830078125, + "reward_std": 0.2561722993850708, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.75146484375, - "rewards/tag_count_reward/std": 0.280868798494339, + "rewards/tag_count_reward/mean": 0.9443359375, + "rewards/tag_count_reward/std": 0.1672183722257614, "step": 691 }, { @@ -20054,27 +20054,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 724.677734375, - "completions/mean_terminated_length": 708.9862060546875, - "completions/min_length": 100.0, - "completions/min_terminated_length": 100.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1020.44921875, + "completions/mean_terminated_length": 972.1185913085938, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.23623794486643337, - "grad_norm": 5.178401470184326, - "kl": 5.0703125, - "learning_rate": 9.501098631913446e-07, - "loss": 0.3659, - "num_tokens": 450699214.0, - "reward": 0.79150390625, - "reward_std": 0.3480370044708252, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.74853515625, - "rewards/tag_count_reward/std": 0.29488927125930786, + "grad_norm": 19.660938262939453, + "kl": 0.873779296875, + "learning_rate": 9.503179632919265e-07, + "loss": 0.147, + "num_tokens": 483622310.0, + "reward": 1.04638671875, + "reward_std": 0.23474785685539246, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95263671875, + "rewards/tag_count_reward/std": 0.16426876187324524, "step": 692 }, { @@ -20083,27 +20083,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1936.0, - "completions/mean_length": 800.224609375, - "completions/mean_terminated_length": 765.1465454101562, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1094.1171875, + "completions/mean_terminated_length": 1028.40087890625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, "epoch": 0.2365793291798242, - "grad_norm": 2.476116895675659, - "kl": 5.4140625, - "learning_rate": 9.498642301240422e-07, - "loss": 0.3614, - "num_tokens": 451178609.0, - "reward": 0.78857421875, - "reward_std": 0.3727818727493286, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.72998046875, - "rewards/tag_count_reward/std": 0.29997488856315613, + "grad_norm": 6.599265098571777, + "kl": 0.62451171875, + "learning_rate": 9.500727192336488e-07, + "loss": 0.1423, + "num_tokens": 484252178.0, + "reward": 1.09814453125, + "reward_std": 0.2637082636356354, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94580078125, + "rewards/tag_count_reward/std": 0.17308342456817627, "step": 693 }, { @@ -20112,27 +20112,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 809.4296875, - "completions/mean_terminated_length": 787.2683715820312, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 1127.314453125, + "completions/mean_terminated_length": 1057.682861328125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, "epoch": 0.23692071349321497, - "grad_norm": 2.4830360412597656, - "kl": 5.6953125, - "learning_rate": 9.496180295226012e-07, - "loss": 0.3686, - "num_tokens": 451665549.0, - "reward": 0.7900390625, - "reward_std": 0.34522438049316406, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.7412109375, - "rewards/tag_count_reward/std": 0.2937224805355072, + "grad_norm": 3.1476166248321533, + "kl": 0.90185546875, + "learning_rate": 9.49826906914412e-07, + "loss": 0.0944, + "num_tokens": 484901875.0, + "reward": 1.0361328125, + "reward_std": 0.2459559589624405, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.19740772247314453, "step": 694 }, { @@ -20141,27 +20141,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 754.28515625, - "completions/mean_terminated_length": 715.2394409179688, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 1960.0, + "completions/mean_length": 967.2578125, + "completions/mean_terminated_length": 907.0928344726562, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.2372620978066058, - "grad_norm": 5.1372833251953125, - "kl": 5.6953125, - "learning_rate": 9.493712617364585e-07, - "loss": 0.3957, - "num_tokens": 452125343.0, - "reward": 0.81884765625, - "reward_std": 0.371033638715744, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, + "grad_norm": 23.895429611206055, + "kl": 1.2744140625, + "learning_rate": 9.495805266833661e-07, + "loss": 0.1372, + "num_tokens": 485470711.0, + "reward": 1.12841796875, + "reward_std": 0.2886299192905426, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39069411158561707, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.75244140625, - "rewards/tag_count_reward/std": 0.2985924184322357, + "rewards/tag_count_reward/mean": 0.94091796875, + "rewards/tag_count_reward/std": 0.1750049889087677, "step": 695 }, { @@ -20170,27 +20170,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1986.0, - "completions/mean_length": 769.015625, - "completions/mean_terminated_length": 711.591796875, - "completions/min_length": 7.0, - "completions/min_terminated_length": 7.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 997.763671875, + "completions/mean_terminated_length": 915.9557495117188, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, "epoch": 0.23760348211999657, - "grad_norm": 7.955462455749512, - "kl": 7.3828125, - "learning_rate": 9.491239271158558e-07, - "loss": 0.4834, - "num_tokens": 452598519.0, - "reward": 0.78466796875, - "reward_std": 0.38856804370880127, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, + "grad_norm": 5.85207986831665, + "kl": 1.0927734375, + "learning_rate": 9.493335788904683e-07, + "loss": 0.1868, + "num_tokens": 486061006.0, + "reward": 1.07861328125, + "reward_std": 0.3283153474330902, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.72216796875, - "rewards/tag_count_reward/std": 0.30541715025901794, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.20810279250144958, "step": 696 }, { @@ -20199,27 +20199,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 771.4140625, - "completions/mean_terminated_length": 714.097900390625, - "completions/min_length": 70.0, - "completions/min_terminated_length": 70.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 1021.322265625, + "completions/mean_terminated_length": 936.670166015625, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.2379448664333874, - "grad_norm": 5.789423942565918, - "kl": 6.9609375, - "learning_rate": 9.488760260118393e-07, - "loss": 0.4788, - "num_tokens": 453069643.0, - "reward": 0.82373046875, - "reward_std": 0.3601980209350586, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.73974609375, - "rewards/tag_count_reward/std": 0.29760515689849854, + "grad_norm": 8.984502792358398, + "kl": 0.71875, + "learning_rate": 9.490860638864818e-07, + "loss": 0.1274, + "num_tokens": 486660083.0, + "reward": 1.08544921875, + "reward_std": 0.2907658517360687, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.20523154735565186, "step": 697 }, { @@ -20228,27 +20228,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 814.333984375, - "completions/mean_terminated_length": 758.9448852539062, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1088.578125, + "completions/mean_terminated_length": 993.8712158203125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, "epoch": 0.23828625074677817, - "grad_norm": 4.101593971252441, - "kl": 6.53125, - "learning_rate": 9.486275587762592e-07, - "loss": 0.463, - "num_tokens": 453560966.0, - "reward": 0.75732421875, - "reward_std": 0.32711267471313477, - "rewards/accuracy_reward/mean": 0.0234375, - "rewards/accuracy_reward/std": 0.15143637359142303, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.73193359375, - "rewards/tag_count_reward/std": 0.3033420741558075, + "grad_norm": 9.338218688964844, + "kl": 0.66015625, + "learning_rate": 9.488379820229755e-07, + "loss": 0.1452, + "num_tokens": 487291819.0, + "reward": 0.990234375, + "reward_std": 0.23613004386425018, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.20983639359474182, "step": 698 }, { @@ -20257,27 +20257,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 842.97265625, - "completions/mean_terminated_length": 765.3097534179688, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1098.69140625, + "completions/mean_terminated_length": 1009.4402465820312, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, "epoch": 0.23862763506016899, - "grad_norm": 2.990671157836914, - "kl": 6.6953125, - "learning_rate": 9.483785257617695e-07, - "loss": 0.4557, - "num_tokens": 454064728.0, - "reward": 0.82080078125, - "reward_std": 0.3680709898471832, - "rewards/accuracy_reward/mean": 0.06451612710952759, - "rewards/accuracy_reward/std": 0.2459181249141693, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.75439453125, - "rewards/tag_count_reward/std": 0.2872966229915619, + "grad_norm": 5.446010589599609, + "kl": 0.82421875, + "learning_rate": 9.485893336523233e-07, + "loss": 0.1365, + "num_tokens": 487926509.0, + "reward": 1.029296875, + "reward_std": 0.3105788826942444, + "rewards/accuracy_reward/mean": 0.12096773833036423, + "rewards/accuracy_reward/std": 0.32641899585723877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.912109375, + "rewards/tag_count_reward/std": 0.2197219431400299, "step": 699 }, { @@ -20286,27 +20286,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 770.794921875, - "completions/mean_terminated_length": 713.4509887695312, - "completions/min_length": 41.0, - "completions/min_terminated_length": 41.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 985.19921875, + "completions/mean_terminated_length": 911.9791259765625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, "epoch": 0.23896901937355977, - "grad_norm": 5.022763729095459, - "kl": 5.5, - "learning_rate": 9.48128927321827e-07, - "loss": 0.4159, - "num_tokens": 454530015.0, - "reward": 0.8564453125, - "reward_std": 0.3755378723144531, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.7666015625, - "rewards/tag_count_reward/std": 0.28363341093063354, + "grad_norm": 1068.2701416015625, + "kl": 22.4677734375, + "learning_rate": 9.483401191277038e-07, + "loss": 1.0403, + "num_tokens": 488501571.0, + "reward": 1.10205078125, + "reward_std": 0.3101636469364166, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.18422965705394745, "step": 700 }, { @@ -20315,27 +20315,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.078125, + "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 848.177734375, - "completions/mean_terminated_length": 746.4978637695312, - "completions/min_length": 36.0, - "completions/min_terminated_length": 36.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1136.693359375, + "completions/mean_terminated_length": 1024.778564453125, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, "epoch": 0.23931040368695058, - "grad_norm": 9.848064422607422, - "kl": 5.9140625, - "learning_rate": 9.478787638106908e-07, - "loss": 0.4741, - "num_tokens": 455043306.0, - "reward": 0.869140625, - "reward_std": 0.3720466196537018, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.7890625, - "rewards/tag_count_reward/std": 0.2814164161682129, + "grad_norm": 40.02294158935547, + "kl": 2.693359375, + "learning_rate": 9.480903388031002e-07, + "loss": 0.274, + "num_tokens": 489162582.0, + "reward": 1.056640625, + "reward_std": 0.32342439889907837, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.2392183393239975, "step": 701 }, { @@ -20344,27 +20344,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.078125, + "completions/clipped_ratio": 0.1015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 802.310546875, - "completions/mean_terminated_length": 696.74365234375, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1017.47265625, + "completions/mean_terminated_length": 900.9782104492188, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, "epoch": 0.2396517880003414, - "grad_norm": 10.3944673538208, - "kl": 6.125, - "learning_rate": 9.476280355834224e-07, - "loss": 0.5099, - "num_tokens": 455537001.0, - "reward": 0.818359375, - "reward_std": 0.3636171817779541, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.763671875, - "rewards/tag_count_reward/std": 0.29228267073631287, + "grad_norm": 18.939517974853516, + "kl": 1.6015625, + "learning_rate": 9.478399930332987e-07, + "loss": 0.1901, + "num_tokens": 489766440.0, + "reward": 1.0146484375, + "reward_std": 0.2792699933052063, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9013671875, + "rewards/tag_count_reward/std": 0.22994530200958252, "step": 702 }, { @@ -20373,27 +20373,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0703125, + "completions/clipped_ratio": 0.10546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 817.974609375, - "completions/mean_terminated_length": 724.947509765625, - "completions/min_length": 69.0, - "completions/min_terminated_length": 69.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 1121.630859375, + "completions/mean_terminated_length": 1012.4083251953125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, "epoch": 0.23999317231373218, - "grad_norm": 2.7264370918273926, - "kl": 6.1015625, - "learning_rate": 9.473767429958846e-07, - "loss": 0.4261, - "num_tokens": 456024540.0, - "reward": 0.78515625, - "reward_std": 0.335408091545105, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.748046875, - "rewards/tag_count_reward/std": 0.28151825070381165, + "grad_norm": 5.711365222930908, + "kl": 1.1943359375, + "learning_rate": 9.475890821738894e-07, + "loss": 0.2002, + "num_tokens": 490409451.0, + "reward": 1.01611328125, + "reward_std": 0.3051610589027405, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90673828125, + "rewards/tag_count_reward/std": 0.2276582568883896, "step": 703 }, { @@ -20402,27 +20402,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 775.990234375, - "completions/mean_terminated_length": 705.1773681640625, - "completions/min_length": 21.0, - "completions/min_terminated_length": 21.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1059.8671875, + "completions/mean_terminated_length": 966.9658813476562, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.240334556627123, - "grad_norm": 5.046159267425537, - "kl": 7.0, - "learning_rate": 9.471248864047415e-07, - "loss": 0.4507, - "num_tokens": 456497527.0, - "reward": 0.81005859375, - "reward_std": 0.34700971841812134, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.76123046875, - "rewards/tag_count_reward/std": 0.2866840064525604, + "grad_norm": 5.511206150054932, + "kl": 1.140625, + "learning_rate": 9.47337606581264e-07, + "loss": 0.2077, + "num_tokens": 491027783.0, + "reward": 1.0263671875, + "reward_std": 0.29222220182418823, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.2060847282409668, "step": 704 }, { @@ -20431,27 +20431,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 800.5, - "completions/mean_terminated_length": 725.5983276367188, - "completions/min_length": 68.0, - "completions/min_terminated_length": 68.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1002.513671875, + "completions/mean_terminated_length": 932.8146362304688, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.24067594094051378, - "grad_norm": 9.976103782653809, - "kl": 8.1796875, - "learning_rate": 9.468724661674571e-07, - "loss": 0.5195, - "num_tokens": 456982023.0, - "reward": 0.88720703125, - "reward_std": 0.4148992896080017, - "rewards/accuracy_reward/mean": 0.12096773833036423, - "rewards/accuracy_reward/std": 0.32641899585723877, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.76220703125, - "rewards/tag_count_reward/std": 0.2891928553581238, + "grad_norm": 4.493340492248535, + "kl": 0.8916015625, + "learning_rate": 9.470855666126176e-07, + "loss": 0.1246, + "num_tokens": 491615710.0, + "reward": 1.13671875, + "reward_std": 0.29003408551216125, + "rewards/accuracy_reward/mean": 0.21572580933570862, + "rewards/accuracy_reward/std": 0.4117402136325836, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.19257840514183044, "step": 705 }, { @@ -20460,27 +20460,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.12109375, + "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 884.802734375, - "completions/mean_terminated_length": 724.5400390625, - "completions/min_length": 35.0, - "completions/min_terminated_length": 35.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1071.4453125, + "completions/mean_terminated_length": 963.4099731445312, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.2410173252539046, - "grad_norm": 8.916975021362305, - "kl": 9.1171875, - "learning_rate": 9.466194826422961e-07, - "loss": 0.606, - "num_tokens": 457515922.0, - "reward": 0.8720703125, - "reward_std": 0.4169638752937317, - "rewards/accuracy_reward/mean": 0.12109375, - "rewards/accuracy_reward/std": 0.3265552520751953, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.7470703125, - "rewards/tag_count_reward/std": 0.3000182807445526, + "grad_norm": 76.13548278808594, + "kl": 3.16796875, + "learning_rate": 9.468329626259459e-07, + "loss": 0.3033, + "num_tokens": 492245170.0, + "reward": 1.1005859375, + "reward_std": 0.2980038821697235, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39069411158561707, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.22451218962669373, "step": 706 }, { @@ -20489,27 +20489,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1973.0, - "completions/mean_length": 897.111328125, - "completions/mean_terminated_length": 732.6986694335938, - "completions/min_length": 75.0, - "completions/min_terminated_length": 75.0, + "completions/max_terminated_length": 1937.0, + "completions/mean_length": 1028.552734375, + "completions/mean_terminated_length": 944.496826171875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.24135870956729538, - "grad_norm": 6.31559944152832, - "kl": 8.90625, - "learning_rate": 9.463659361883219e-07, - "loss": 0.6457, - "num_tokens": 458057579.0, - "reward": 0.796875, - "reward_std": 0.37265852093696594, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.7421875, - "rewards/tag_count_reward/std": 0.30155742168426514, + "grad_norm": 4.781183242797852, + "kl": 1.1416015625, + "learning_rate": 9.465797949800462e-07, + "loss": 0.1637, + "num_tokens": 492854125.0, + "reward": 1.04345703125, + "reward_std": 0.27039211988449097, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.20703940093517303, "step": 707 }, { @@ -20518,27 +20518,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.10546875, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 894.158203125, - "completions/mean_terminated_length": 758.11572265625, - "completions/min_length": 87.0, - "completions/min_terminated_length": 87.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1047.654296875, + "completions/mean_terminated_length": 948.90771484375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.2417000938806862, - "grad_norm": 2.506101131439209, - "kl": 7.1328125, - "learning_rate": 9.46111827165398e-07, - "loss": 0.5252, - "num_tokens": 458596572.0, - "reward": 0.77197265625, - "reward_std": 0.33300668001174927, - "rewards/accuracy_reward/mean": 0.01953125, - "rewards/accuracy_reward/std": 0.1385180652141571, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.74462890625, - "rewards/tag_count_reward/std": 0.2910865247249603, + "grad_norm": 7.940341472625732, + "kl": 1.4130859375, + "learning_rate": 9.463260640345164e-07, + "loss": 0.2157, + "num_tokens": 493471708.0, + "reward": 0.9873046875, + "reward_std": 0.26273661851882935, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.21526481211185455, "step": 708 }, { @@ -20547,27 +20547,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.07421875, + "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 835.6171875, - "completions/mean_terminated_length": 738.421875, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 1960.0, + "completions/mean_length": 1103.5625, + "completions/mean_terminated_length": 989.8993530273438, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.24204147819407698, - "grad_norm": 10.483278274536133, - "kl": 4.9921875, - "learning_rate": 9.458571559341849e-07, - "loss": 0.3835, - "num_tokens": 459093640.0, - "reward": 0.8203125, - "reward_std": 0.3339378237724304, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.7734375, - "rewards/tag_count_reward/std": 0.27123603224754333, + "grad_norm": 3.457846164703369, + "kl": 1.0087890625, + "learning_rate": 9.460717701497546e-07, + "loss": 0.1961, + "num_tokens": 494105964.0, + "reward": 0.97607421875, + "reward_std": 0.24296388030052185, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90576171875, + "rewards/tag_count_reward/std": 0.22832883894443512, "step": 709 }, { @@ -20576,27 +20576,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1990.0, - "completions/mean_length": 740.6640625, - "completions/mean_terminated_length": 656.407470703125, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 993.1796875, + "completions/mean_terminated_length": 922.8583984375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, "epoch": 0.2423828625074678, - "grad_norm": 14.420405387878418, - "kl": 4.7890625, - "learning_rate": 9.456019228561425e-07, - "loss": 0.3945, - "num_tokens": 459544044.0, - "reward": 0.90966796875, - "reward_std": 0.3614538908004761, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.80810546875, - "rewards/tag_count_reward/std": 0.26802483201026917, + "grad_norm": 3.3752620220184326, + "kl": 0.65283203125, + "learning_rate": 9.458169136869581e-07, + "loss": 0.1428, + "num_tokens": 494685656.0, + "reward": 1.04833984375, + "reward_std": 0.2690609097480774, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.18708612024784088, "step": 710 }, { @@ -20605,27 +20605,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.083984375, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 838.923828125, - "completions/mean_terminated_length": 728.0703735351562, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1067.66015625, + "completions/mean_terminated_length": 997.9288330078125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, "epoch": 0.24272424682085858, - "grad_norm": 5.348476409912109, - "kl": 5.1328125, - "learning_rate": 9.453461282935271e-07, - "loss": 0.3805, - "num_tokens": 460050149.0, - "reward": 0.88134765625, - "reward_std": 0.365867555141449, - "rewards/accuracy_reward/mean": 0.09677419066429138, - "rewards/accuracy_reward/std": 0.2959485352039337, - "rewards/format_reward/mean": 0.01171875, - "rewards/format_reward/std": 0.10772226005792618, - "rewards/tag_count_reward/mean": 0.77587890625, - "rewards/tag_count_reward/std": 0.2857325077056885, + "grad_norm": 4.665998458862305, + "kl": 0.51708984375, + "learning_rate": 9.455614950081236e-07, + "loss": 0.1397, + "num_tokens": 495308874.0, + "reward": 1.0986328125, + "reward_std": 0.2627273499965668, + "rewards/accuracy_reward/mean": 0.16733871400356293, + "rewards/accuracy_reward/std": 0.37365487217903137, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.1820572316646576, "step": 711 }, { @@ -20634,27 +20634,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.08203125, + "completions/clipped_ratio": 0.103515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 820.046875, - "completions/mean_terminated_length": 710.3148803710938, - "completions/min_length": 22.0, - "completions/min_terminated_length": 22.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1091.50390625, + "completions/mean_terminated_length": 981.0587768554688, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, "epoch": 0.2430656311342494, - "grad_norm": 6.827632427215576, - "kl": 6.3515625, - "learning_rate": 9.450897726093924e-07, - "loss": 0.4942, - "num_tokens": 460545565.0, - "reward": 0.869140625, - "reward_std": 0.37961816787719727, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, - "rewards/tag_count_reward/mean": 0.76953125, - "rewards/tag_count_reward/std": 0.2898467481136322, + "grad_norm": 13.149555206298828, + "kl": 1.359375, + "learning_rate": 9.453055144760462e-07, + "loss": 0.2086, + "num_tokens": 495943276.0, + "reward": 1.05615234375, + "reward_std": 0.2935149073600769, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90771484375, + "rewards/tag_count_reward/std": 0.22590121626853943, "step": 712 }, { @@ -20663,27 +20663,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.09375, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 825.546875, - "completions/mean_terminated_length": 699.086181640625, - "completions/min_length": 56.0, - "completions/min_terminated_length": 56.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1078.240234375, + "completions/mean_terminated_length": 944.62890625, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, "epoch": 0.24340701544764018, - "grad_norm": 7.4668121337890625, - "kl": 7.359375, - "learning_rate": 9.448328561675883e-07, - "loss": 0.5807, - "num_tokens": 461042709.0, - "reward": 0.8330078125, - "reward_std": 0.32038432359695435, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.7900390625, - "rewards/tag_count_reward/std": 0.2737869620323181, + "grad_norm": 3.737703561782837, + "kl": 0.9765625, + "learning_rate": 9.450489724543195e-07, + "loss": 0.1947, + "num_tokens": 496569799.0, + "reward": 0.97216796875, + "reward_std": 0.2608855962753296, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89404296875, + "rewards/tag_count_reward/std": 0.235911563038826, "step": 713 }, { @@ -20692,27 +20692,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0703125, + "completions/clipped_ratio": 0.091796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 775.900390625, - "completions/mean_terminated_length": 679.6912231445312, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1054.4609375, + "completions/mean_terminated_length": 954.0387573242188, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.243748399761031, - "grad_norm": 7.458494663238525, - "kl": 7.953125, - "learning_rate": 9.44575379332761e-07, - "loss": 0.514, - "num_tokens": 461517026.0, - "reward": 0.8349609375, - "reward_std": 0.33914488554000854, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.013671875, - "rewards/format_reward/std": 0.1162383034825325, - "rewards/tag_count_reward/mean": 0.7724609375, - "rewards/tag_count_reward/std": 0.2810613811016083, + "grad_norm": 4.930484294891357, + "kl": 0.958984375, + "learning_rate": 9.447918693073339e-07, + "loss": 0.1872, + "num_tokens": 497186739.0, + "reward": 1.01318359375, + "reward_std": 0.2567800283432007, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91552734375, + "rewards/tag_count_reward/std": 0.21517324447631836, "step": 714 }, { @@ -20721,27 +20721,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.083984375, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 811.75, - "completions/mean_terminated_length": 698.4051513671875, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1038.640625, + "completions/mean_terminated_length": 973.58837890625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.24408978407442178, - "grad_norm": 8.241525650024414, - "kl": 9.359375, - "learning_rate": 9.443173424703514e-07, - "loss": 0.6233, - "num_tokens": 462003106.0, - "reward": 0.8427734375, - "reward_std": 0.35059112310409546, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.7646484375, - "rewards/tag_count_reward/std": 0.277199923992157, + "grad_norm": 7.069862365722656, + "kl": 0.978515625, + "learning_rate": 9.445342054002775e-07, + "loss": 0.1449, + "num_tokens": 497788987.0, + "reward": 1.0400390625, + "reward_std": 0.23149043321609497, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9404296875, + "rewards/tag_count_reward/std": 0.17932851612567902, "step": 715 }, { @@ -20750,27 +20750,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.068359375, + "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 850.09765625, - "completions/mean_terminated_length": 762.2012329101562, - "completions/min_length": 34.0, - "completions/min_terminated_length": 34.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1149.5078125, + "completions/mean_terminated_length": 1041.3741455078125, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, "epoch": 0.2444311683878126, - "grad_norm": 14.613713264465332, - "kl": 8.53125, - "learning_rate": 9.440587459465956e-07, - "loss": 0.4895, - "num_tokens": 462516420.0, - "reward": 0.80224609375, - "reward_std": 0.3848583698272705, - "rewards/accuracy_reward/mean": 0.04233871027827263, - "rewards/accuracy_reward/std": 0.2015640139579773, - "rewards/format_reward/mean": 0.021484375, - "rewards/format_reward/std": 0.14513419568538666, - "rewards/tag_count_reward/mean": 0.73974609375, - "rewards/tag_count_reward/std": 0.2884219288825989, + "grad_norm": 7.361259937286377, + "kl": 0.9892578125, + "learning_rate": 9.442759810991345e-07, + "loss": 0.1988, + "num_tokens": 498455599.0, + "reward": 0.96435546875, + "reward_std": 0.28180551528930664, + "rewards/accuracy_reward/mean": 0.058467742055654526, + "rewards/accuracy_reward/std": 0.23486268520355225, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90771484375, + "rewards/tag_count_reward/std": 0.22372503578662872, "step": 716 }, { @@ -20779,27 +20779,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.06640625, + "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 868.58203125, - "completions/mean_terminated_length": 784.6903686523438, - "completions/min_length": 54.0, - "completions/min_terminated_length": 54.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1154.73828125, + "completions/mean_terminated_length": 1055.9176025390625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.24477255270120338, - "grad_norm": 13.657511711120605, - "kl": 9.0859375, - "learning_rate": 9.437995901285246e-07, - "loss": 0.5348, - "num_tokens": 463034974.0, - "reward": 0.8486328125, - "reward_std": 0.3591812252998352, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.7724609375, - "rewards/tag_count_reward/std": 0.2726678252220154, + "grad_norm": 2.7958168983459473, + "kl": 1.0244140625, + "learning_rate": 9.440171967706852e-07, + "loss": 0.1561, + "num_tokens": 499120665.0, + "reward": 1.02734375, + "reward_std": 0.293550968170166, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.896484375, + "rewards/tag_count_reward/std": 0.23621970415115356, "step": 717 }, { @@ -20808,27 +20808,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.08203125, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1948.0, - "completions/mean_length": 801.046875, - "completions/mean_terminated_length": 689.6170043945312, - "completions/min_length": 69.0, - "completions/min_terminated_length": 69.0, + "completions/max_terminated_length": 1952.0, + "completions/mean_length": 1010.73828125, + "completions/mean_terminated_length": 952.9938354492188, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, "epoch": 0.2451139370145942, - "grad_norm": 7.152915000915527, - "kl": 8.328125, - "learning_rate": 9.435398753839622e-07, - "loss": 0.5683, - "num_tokens": 463519878.0, - "reward": 0.8486328125, - "reward_std": 0.36477065086364746, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.015625, - "rewards/format_reward/std": 0.12414088100194931, - "rewards/tag_count_reward/mean": 0.7724609375, - "rewards/tag_count_reward/std": 0.2722189128398895, + "grad_norm": 8.287996292114258, + "kl": 1.041015625, + "learning_rate": 9.437578527825055e-07, + "loss": 0.1763, + "num_tokens": 499712931.0, + "reward": 1.072265625, + "reward_std": 0.26099175214767456, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.939453125, + "rewards/tag_count_reward/std": 0.1783159226179123, "step": 718 }, { @@ -20837,27 +20837,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 802.802734375, - "completions/mean_terminated_length": 719.7896118164062, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1062.5546875, + "completions/mean_terminated_length": 992.460205078125, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, "epoch": 0.24545532132798498, - "grad_norm": 2.03483510017395, - "kl": 7.25, - "learning_rate": 9.432796020815261e-07, - "loss": 0.5188, - "num_tokens": 464007617.0, - "reward": 0.826171875, - "reward_std": 0.33768337965011597, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.7734375, - "rewards/tag_count_reward/std": 0.2844424843788147, + "grad_norm": 4.7984771728515625, + "kl": 0.6640625, + "learning_rate": 9.434979495029658e-07, + "loss": 0.1494, + "num_tokens": 500333663.0, + "reward": 1.0478515625, + "reward_std": 0.24851752817630768, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.18982726335525513, "step": 719 }, { @@ -20866,27 +20866,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 758.4921875, - "completions/mean_terminated_length": 695.07373046875, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1058.337890625, + "completions/mean_terminated_length": 974.4682006835938, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, "epoch": 0.2457967056413758, - "grad_norm": 5.960138320922852, - "kl": 5.21875, - "learning_rate": 9.430187705906268e-07, - "loss": 0.3848, - "num_tokens": 464465917.0, - "reward": 0.86767578125, - "reward_std": 0.3609304130077362, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.013671875, - "rewards/format_reward/std": 0.1162383034825325, - "rewards/tag_count_reward/mean": 0.78369140625, - "rewards/tag_count_reward/std": 0.2712813913822174, + "grad_norm": 7.418297290802002, + "kl": 0.6455078125, + "learning_rate": 9.432374873012313e-07, + "loss": 0.1462, + "num_tokens": 500945484.0, + "reward": 1.04150390625, + "reward_std": 0.25468748807907104, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.20525948703289032, "step": 720 }, { @@ -20895,27 +20895,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.083984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 803.236328125, - "completions/mean_terminated_length": 739.3367919921875, - "completions/min_length": 24.0, - "completions/min_terminated_length": 24.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1084.05859375, + "completions/mean_terminated_length": 995.68017578125, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, "epoch": 0.24613808995476658, - "grad_norm": 7.317914009094238, - "kl": 4.87890625, - "learning_rate": 9.427573812814666e-07, - "loss": 0.3755, - "num_tokens": 464947862.0, - "reward": 0.89453125, - "reward_std": 0.3675106167793274, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.791015625, - "rewards/tag_count_reward/std": 0.27139803767204285, + "grad_norm": 4.748532295227051, + "kl": 0.8251953125, + "learning_rate": 9.429764665472612e-07, + "loss": 0.159, + "num_tokens": 501571210.0, + "reward": 1.09716796875, + "reward_std": 0.3214423656463623, + "rewards/accuracy_reward/mean": 0.17578125, + "rewards/accuracy_reward/std": 0.3810062110424042, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.20877666771411896, "step": 721 }, { @@ -20924,27 +20924,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1968.0, - "completions/mean_length": 773.197265625, - "completions/mean_terminated_length": 702.2288818359375, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 988.26171875, + "completions/mean_terminated_length": 926.9545288085938, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, "epoch": 0.2464794742681574, - "grad_norm": 7.03244686126709, - "kl": 4.75390625, - "learning_rate": 9.424954345250401e-07, - "loss": 0.3533, - "num_tokens": 465410731.0, - "reward": 0.83984375, - "reward_std": 0.29207414388656616, - "rewards/accuracy_reward/mean": 0.021484375, - "rewards/accuracy_reward/std": 0.14513419568538666, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, - "rewards/tag_count_reward/mean": 0.80859375, - "rewards/tag_count_reward/std": 0.2630811929702759, + "grad_norm": 3.508500099182129, + "kl": 0.65087890625, + "learning_rate": 9.427148876118077e-07, + "loss": 0.1545, + "num_tokens": 502144192.0, + "reward": 0.9697265625, + "reward_std": 0.1716979444026947, + "rewards/accuracy_reward/mean": 0.02734375, + "rewards/accuracy_reward/std": 0.16324250400066376, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9423828125, + "rewards/tag_count_reward/std": 0.17514485120773315, "step": 722 }, { @@ -20953,27 +20953,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0859375, + "completions/clipped_ratio": 0.095703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 816.486328125, - "completions/mean_terminated_length": 700.7030029296875, - "completions/min_length": 50.0, - "completions/min_terminated_length": 50.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1053.5078125, + "completions/mean_terminated_length": 948.2591552734375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.24682085858154817, - "grad_norm": 5.78585147857666, - "kl": 5.56640625, - "learning_rate": 9.422329306931325e-07, - "loss": 0.4128, - "num_tokens": 465902532.0, - "reward": 0.83154296875, - "reward_std": 0.3125340938568115, - "rewards/accuracy_reward/mean": 0.02734375, - "rewards/accuracy_reward/std": 0.16324250400066376, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.79833984375, - "rewards/tag_count_reward/std": 0.27311384677886963, + "grad_norm": 4.22104024887085, + "kl": 1.134765625, + "learning_rate": 9.424527508664157e-07, + "loss": 0.1824, + "num_tokens": 502757348.0, + "reward": 0.9873046875, + "reward_std": 0.20495033264160156, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.21240492165088654, "step": 723 }, { @@ -20982,27 +20982,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.146484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 797.927734375, - "completions/mean_terminated_length": 733.7556762695312, - "completions/min_length": 35.0, - "completions/min_terminated_length": 35.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 1143.0859375, + "completions/mean_terminated_length": 987.7802734375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, "epoch": 0.247162242894939, - "grad_norm": 2.1381256580352783, - "kl": 6.3203125, - "learning_rate": 9.419698701583204e-07, - "loss": 0.4465, - "num_tokens": 466392495.0, - "reward": 0.8359375, - "reward_std": 0.367983341217041, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.013671875, - "rewards/format_reward/std": 0.1162383034825325, - "rewards/tag_count_reward/mean": 0.767578125, - "rewards/tag_count_reward/std": 0.28400543332099915, + "grad_norm": 19.98847198486328, + "kl": 1.771484375, + "learning_rate": 9.421900566834233e-07, + "loss": 0.2436, + "num_tokens": 503424032.0, + "reward": 0.96630859375, + "reward_std": 0.3224031925201416, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87841796875, + "rewards/tag_count_reward/std": 0.26075315475463867, "step": 724 }, { @@ -21011,27 +21011,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.07421875, + "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 818.634765625, - "completions/mean_terminated_length": 720.0780029296875, - "completions/min_length": 52.0, - "completions/min_terminated_length": 52.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 1040.48046875, + "completions/mean_terminated_length": 919.2254028320312, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, "epoch": 0.24750362720832977, - "grad_norm": 4.268383502960205, - "kl": 8.0078125, - "learning_rate": 9.417062532939698e-07, - "loss": 0.5542, - "num_tokens": 466887652.0, - "reward": 0.82861328125, - "reward_std": 0.3431798815727234, - "rewards/accuracy_reward/mean": 0.04032257944345474, - "rewards/accuracy_reward/std": 0.19691328704357147, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, - "rewards/tag_count_reward/mean": 0.77978515625, - "rewards/tag_count_reward/std": 0.2748715281486511, + "grad_norm": 3.7426517009735107, + "kl": 1.201171875, + "learning_rate": 9.419268054359594e-07, + "loss": 0.2092, + "num_tokens": 504032774.0, + "reward": 0.98486328125, + "reward_std": 0.2758023142814636, + "rewards/accuracy_reward/mean": 0.08669354766607285, + "rewards/accuracy_reward/std": 0.281669557094574, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90087890625, + "rewards/tag_count_reward/std": 0.23264455795288086, "step": 725 }, { @@ -21040,27 +21040,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.076171875, + "completions/clipped_ratio": 0.09765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 811.517578125, - "completions/mean_terminated_length": 709.5665893554688, - "completions/min_length": 10.0, - "completions/min_terminated_length": 10.0, + "completions/max_terminated_length": 1900.0, + "completions/mean_length": 1043.919921875, + "completions/mean_terminated_length": 935.2532348632812, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.2478450115217206, - "grad_norm": 3.5171008110046387, - "kl": 7.375, - "learning_rate": 9.414420804742366e-07, - "loss": 0.4769, - "num_tokens": 467385501.0, - "reward": 0.80908203125, - "reward_std": 0.3296193480491638, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.015625, - "rewards/format_reward/std": 0.12414088100194931, - "rewards/tag_count_reward/mean": 0.76220703125, - "rewards/tag_count_reward/std": 0.27929604053497314, + "grad_norm": 14.944722175598145, + "kl": 1.8857421875, + "learning_rate": 9.416629974979448e-07, + "loss": 0.2595, + "num_tokens": 504649613.0, + "reward": 0.96533203125, + "reward_std": 0.24623164534568787, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90869140625, + "rewards/tag_count_reward/std": 0.22898492217063904, "step": 726 }, { @@ -21069,27 +21069,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 797.9375, - "completions/mean_terminated_length": 714.6000366210938, - "completions/min_length": 91.0, - "completions/min_terminated_length": 91.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1072.001953125, + "completions/mean_terminated_length": 937.5311279296875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.24818639583511137, - "grad_norm": 10.195523262023926, - "kl": 6.859375, - "learning_rate": 9.41177352074066e-07, - "loss": 0.4419, - "num_tokens": 467876237.0, - "reward": 0.79736328125, - "reward_std": 0.3004957437515259, - "rewards/accuracy_reward/mean": 0.009765625, - "rewards/accuracy_reward/std": 0.09843364357948303, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, - "rewards/tag_count_reward/mean": 0.77783203125, - "rewards/tag_count_reward/std": 0.26968812942504883, + "grad_norm": 2.1958847045898438, + "kl": 0.9375, + "learning_rate": 9.413986332440903e-07, + "loss": 0.1899, + "num_tokens": 505280670.0, + "reward": 0.91455078125, + "reward_std": 0.22004647552967072, + "rewards/accuracy_reward/mean": 0.0234375, + "rewards/accuracy_reward/std": 0.15143637359142303, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89111328125, + "rewards/tag_count_reward/std": 0.24377600848674774, "step": 727 }, { @@ -21098,27 +21098,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 800.2578125, - "completions/mean_terminated_length": 733.5061645507812, - "completions/min_length": 40.0, - "completions/min_terminated_length": 40.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1069.27734375, + "completions/mean_terminated_length": 981.8170166015625, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, "epoch": 0.2485277801485022, - "grad_norm": 3.3900604248046875, - "kl": 6.796875, - "learning_rate": 9.409120684691915e-07, - "loss": 0.4141, - "num_tokens": 468363137.0, - "reward": 0.81640625, - "reward_std": 0.321017324924469, - "rewards/accuracy_reward/mean": 0.032258063554763794, - "rewards/accuracy_reward/std": 0.17686307430267334, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.78125, - "rewards/tag_count_reward/std": 0.26542437076568604, + "grad_norm": 2.0797388553619385, + "kl": 0.96484375, + "learning_rate": 9.411337130498977e-07, + "loss": 0.1611, + "num_tokens": 505905308.0, + "reward": 0.9814453125, + "reward_std": 0.23239141702651978, + "rewards/accuracy_reward/mean": 0.058467742055654526, + "rewards/accuracy_reward/std": 0.23486268520355225, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.20680677890777588, "step": 728 }, { @@ -21127,27 +21127,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1963.0, - "completions/mean_length": 793.02734375, - "completions/mean_terminated_length": 712.1455078125, - "completions/min_length": 20.0, - "completions/min_terminated_length": 20.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1065.705078125, + "completions/mean_terminated_length": 995.834716796875, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, "epoch": 0.24886916446189297, - "grad_norm": 2.7198610305786133, - "kl": 6.46875, - "learning_rate": 9.406462300361345e-07, - "loss": 0.4298, - "num_tokens": 468844687.0, - "reward": 0.80908203125, - "reward_std": 0.3425788879394531, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.01171875, - "rewards/format_reward/std": 0.10772226005792618, - "rewards/tag_count_reward/mean": 0.76025390625, - "rewards/tag_count_reward/std": 0.2802489399909973, + "grad_norm": 1.9189743995666504, + "kl": 0.744140625, + "learning_rate": 9.408682372916582e-07, + "loss": 0.1508, + "num_tokens": 506526469.0, + "reward": 1.04833984375, + "reward_std": 0.2662036418914795, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93310546875, + "rewards/tag_count_reward/std": 0.19102340936660767, "step": 729 }, { @@ -21156,27 +21156,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1966.0, - "completions/mean_length": 744.380859375, - "completions/mean_terminated_length": 680.2684326171875, - "completions/min_length": 34.0, - "completions/min_terminated_length": 34.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 1039.390625, + "completions/mean_terminated_length": 965.3836059570312, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.24921054877528379, - "grad_norm": 1.6548537015914917, - "kl": 5.2265625, - "learning_rate": 9.403798371522042e-07, - "loss": 0.3176, - "num_tokens": 469297794.0, - "reward": 0.88818359375, - "reward_std": 0.3665573298931122, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.021484375, - "rewards/format_reward/std": 0.14513419568538666, - "rewards/tag_count_reward/mean": 0.77685546875, - "rewards/tag_count_reward/std": 0.2661357820034027, + "grad_norm": 1.7278939485549927, + "kl": 0.82373046875, + "learning_rate": 9.406022063464516e-07, + "loss": 0.1684, + "num_tokens": 507130621.0, + "reward": 1.06787109375, + "reward_std": 0.2978924512863159, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.18455612659454346, "step": 730 }, { @@ -21185,27 +21185,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.087890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 795.10546875, - "completions/mean_terminated_length": 738.85302734375, - "completions/min_length": 66.0, - "completions/min_terminated_length": 66.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1125.212890625, + "completions/mean_terminated_length": 1036.2933349609375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, "epoch": 0.24955193308867457, - "grad_norm": 8.122745513916016, - "kl": 4.5234375, - "learning_rate": 9.401128901954964e-07, - "loss": 0.3331, - "num_tokens": 469783624.0, - "reward": 0.85302734375, - "reward_std": 0.3369097113609314, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.015625, - "rewards/format_reward/std": 0.12414088100194931, - "rewards/tag_count_reward/mean": 0.78662109375, - "rewards/tag_count_reward/std": 0.2699962258338928, + "grad_norm": 2.8078134059906006, + "kl": 0.8388671875, + "learning_rate": 9.403356205921471e-07, + "loss": 0.1912, + "num_tokens": 507785466.0, + "reward": 0.99462890625, + "reward_std": 0.26011571288108826, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90869140625, + "rewards/tag_count_reward/std": 0.21804066002368927, "step": 731 }, { @@ -21214,27 +21214,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 736.365234375, - "completions/mean_terminated_length": 657.6128540039062, - "completions/min_length": 34.0, - "completions/min_terminated_length": 34.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 956.703125, + "completions/mean_terminated_length": 898.3209838867188, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.24989331740206538, - "grad_norm": 3.018124580383301, - "kl": 5.046875, - "learning_rate": 9.398453895448936e-07, - "loss": 0.3587, - "num_tokens": 470232739.0, - "reward": 0.83642578125, - "reward_std": 0.3421552777290344, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.013671875, - "rewards/format_reward/std": 0.1162383034825325, - "rewards/tag_count_reward/mean": 0.78369140625, - "rewards/tag_count_reward/std": 0.26765021681785583, + "grad_norm": 2.4974358081817627, + "kl": 0.546875, + "learning_rate": 9.400684804074015e-07, + "loss": 0.1341, + "num_tokens": 508347394.0, + "reward": 1.06689453125, + "reward_std": 0.2672039270401001, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95166015625, + "rewards/tag_count_reward/std": 0.167671337723732, "step": 732 }, { @@ -21243,27 +21243,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.091796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 741.564453125, - "completions/mean_terminated_length": 674.4990234375, - "completions/min_length": 9.0, - "completions/min_terminated_length": 9.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1021.470703125, + "completions/mean_terminated_length": 917.7139892578125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.2502347017154562, - "grad_norm": 4.018496036529541, - "kl": 4.15234375, - "learning_rate": 9.395773355800643e-07, - "loss": 0.268, - "num_tokens": 470697764.0, - "reward": 0.87939453125, - "reward_std": 0.352216899394989, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.015625, - "rewards/format_reward/std": 0.12414088100194931, - "rewards/tag_count_reward/mean": 0.80712890625, - "rewards/tag_count_reward/std": 0.2551484704017639, + "grad_norm": 8.79793930053711, + "kl": 1.10107421875, + "learning_rate": 9.398007861716589e-07, + "loss": 0.191, + "num_tokens": 508955731.0, + "reward": 1.01416015625, + "reward_std": 0.279678076505661, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91845703125, + "rewards/tag_count_reward/std": 0.2145996242761612, "step": 733 }, { @@ -21272,27 +21272,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 751.60546875, - "completions/mean_terminated_length": 704.3684692382812, - "completions/min_length": 99.0, - "completions/min_terminated_length": 99.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1060.44140625, + "completions/mean_terminated_length": 987.97900390625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.25057608602884696, - "grad_norm": 2.691394805908203, - "kl": 5.234375, - "learning_rate": 9.393087286814616e-07, - "loss": 0.3312, - "num_tokens": 471164554.0, - "reward": 0.82421875, - "reward_std": 0.31793129444122314, - "rewards/accuracy_reward/mean": 0.02734375, - "rewards/accuracy_reward/std": 0.16324250400066376, - "rewards/format_reward/mean": 0.01171875, - "rewards/format_reward/std": 0.10772226005792618, - "rewards/tag_count_reward/mean": 0.78515625, - "rewards/tag_count_reward/std": 0.2588631808757782, + "grad_norm": 2.8684802055358887, + "kl": 0.6962890625, + "learning_rate": 9.39532538265151e-07, + "loss": 0.1673, + "num_tokens": 509580645.0, + "reward": 0.962890625, + "reward_std": 0.19349947571754456, + "rewards/accuracy_reward/mean": 0.025390625, + "rewards/accuracy_reward/std": 0.15746226906776428, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.18373169004917145, "step": 734 }, { @@ -21301,27 +21301,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 725.771484375, - "completions/mean_terminated_length": 683.1189575195312, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 998.53125, + "completions/mean_terminated_length": 907.17626953125, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, "epoch": 0.25091747034223777, - "grad_norm": 3.793814182281494, - "kl": 5.13671875, - "learning_rate": 9.39039569230324e-07, - "loss": 0.347, - "num_tokens": 471614277.0, - "reward": 0.8583984375, - "reward_std": 0.34439292550086975, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.01171875, - "rewards/format_reward/std": 0.10772226005792618, - "rewards/tag_count_reward/mean": 0.7958984375, - "rewards/tag_count_reward/std": 0.2660554349422455, + "grad_norm": 3.143089532852173, + "kl": 0.7255859375, + "learning_rate": 9.392637370688951e-07, + "loss": 0.1674, + "num_tokens": 510170021.0, + "reward": 1.06591796875, + "reward_std": 0.2568363547325134, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.1979750692844391, "step": 735 }, { @@ -21330,27 +21330,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1919.0, - "completions/mean_length": 733.33984375, - "completions/mean_terminated_length": 679.8983764648438, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1039.94921875, + "completions/mean_terminated_length": 965.9832153320312, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, "epoch": 0.2512588546556286, - "grad_norm": 2.680269718170166, - "kl": 6.0625, - "learning_rate": 9.387698576086743e-07, - "loss": 0.3465, - "num_tokens": 472067875.0, - "reward": 0.7998046875, - "reward_std": 0.33144643902778625, - "rewards/accuracy_reward/mean": 0.02734375, - "rewards/accuracy_reward/std": 0.16324250400066376, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, - "rewards/tag_count_reward/mean": 0.7626953125, - "rewards/tag_count_reward/std": 0.27641287446022034, + "grad_norm": 2.5431292057037354, + "kl": 0.58837890625, + "learning_rate": 9.389943829646953e-07, + "loss": 0.1749, + "num_tokens": 510780603.0, + "reward": 1.021484375, + "reward_std": 0.25412070751190186, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.1953079104423523, "step": 736 }, { @@ -21359,27 +21359,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1886.0, - "completions/mean_length": 656.005859375, - "completions/mean_terminated_length": 611.102783203125, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 913.486328125, + "completions/mean_terminated_length": 857.6905517578125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, "epoch": 0.2516002389690194, - "grad_norm": 2.185762882232666, - "kl": 5.390625, - "learning_rate": 9.384995941993187e-07, - "loss": 0.3094, - "num_tokens": 472473798.0, - "reward": 0.87548828125, - "reward_std": 0.3545587956905365, - "rewards/accuracy_reward/mean": 0.07258064299821854, - "rewards/accuracy_reward/std": 0.25970885157585144, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, - "rewards/tag_count_reward/mean": 0.79541015625, - "rewards/tag_count_reward/std": 0.25939005613327026, + "grad_norm": 3.322721481323242, + "kl": 0.6826171875, + "learning_rate": 9.387244763351403e-07, + "loss": 0.1552, + "num_tokens": 511318356.0, + "reward": 1.10400390625, + "reward_std": 0.2542824447154999, + "rewards/accuracy_reward/mean": 0.16330644488334656, + "rewards/accuracy_reward/std": 0.37001824378967285, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94580078125, + "rewards/tag_count_reward/std": 0.171664297580719, "step": 737 }, { @@ -21388,27 +21388,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 773.515625, - "completions/mean_terminated_length": 721.707275390625, - "completions/min_length": 12.0, - "completions/min_terminated_length": 12.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1052.099609375, + "completions/mean_terminated_length": 976.7794799804688, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.25194162328241015, - "grad_norm": 5.866791725158691, - "kl": 6.046875, - "learning_rate": 9.382287793858467e-07, - "loss": 0.334, - "num_tokens": 472942782.0, - "reward": 0.85400390625, - "reward_std": 0.35613566637039185, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.77978515625, - "rewards/tag_count_reward/std": 0.26947900652885437, + "grad_norm": 5.92504358291626, + "kl": 0.9052734375, + "learning_rate": 9.384540175636042e-07, + "loss": 0.1786, + "num_tokens": 511929975.0, + "reward": 1.0703125, + "reward_std": 0.2450016289949417, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.1895880103111267, "step": 738 }, { @@ -21417,27 +21417,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 712.3984375, - "completions/mean_terminated_length": 666.529296875, - "completions/min_length": 14.0, - "completions/min_terminated_length": 14.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 988.81640625, + "completions/mean_terminated_length": 908.7101440429688, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, "epoch": 0.25228300759580097, - "grad_norm": 2.493330717086792, - "kl": 5.6796875, - "learning_rate": 9.379574135526304e-07, - "loss": 0.3737, - "num_tokens": 473382570.0, - "reward": 0.8818359375, - "reward_std": 0.36612144112586975, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.013671875, - "rewards/format_reward/std": 0.1162383034825325, - "rewards/tag_count_reward/mean": 0.7822265625, - "rewards/tag_count_reward/std": 0.2666867971420288, + "grad_norm": 7.068661212921143, + "kl": 0.91845703125, + "learning_rate": 9.381830070342446e-07, + "loss": 0.1494, + "num_tokens": 512511289.0, + "reward": 1.1357421875, + "reward_std": 0.30168014764785767, + "rewards/accuracy_reward/mean": 0.197265625, + "rewards/accuracy_reward/std": 0.3983237147331238, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.19377298653125763, "step": 739 }, { @@ -21446,27 +21446,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 695.375, - "completions/mean_terminated_length": 646.089111328125, - "completions/min_length": 45.0, - "completions/min_terminated_length": 45.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 991.103515625, + "completions/mean_terminated_length": 925.3216552734375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, "epoch": 0.2526243919091918, - "grad_norm": 3.5300133228302, - "kl": 5.296875, - "learning_rate": 9.376854970848239e-07, - "loss": 0.3468, - "num_tokens": 473816058.0, - "reward": 0.85400390625, - "reward_std": 0.34921133518218994, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.77587890625, - "rewards/tag_count_reward/std": 0.26897501945495605, + "grad_norm": 13.25341796875, + "kl": 0.6748046875, + "learning_rate": 9.379114451320039e-07, + "loss": 0.1429, + "num_tokens": 513096190.0, + "reward": 1.0625, + "reward_std": 0.2925964891910553, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.18567688763141632, "step": 740 }, { @@ -21475,27 +21475,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1957.0, - "completions/mean_length": 717.44921875, - "completions/mean_terminated_length": 643.3773193359375, - "completions/min_length": 37.0, - "completions/min_terminated_length": 37.0, + "completions/max_terminated_length": 1942.0, + "completions/mean_length": 1039.828125, + "completions/mean_terminated_length": 972.61669921875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.2529657762225826, - "grad_norm": 4.015928268432617, - "kl": 5.875, - "learning_rate": 9.374130303683628e-07, - "loss": 0.3583, - "num_tokens": 474258784.0, - "reward": 0.86572265625, - "reward_std": 0.35375192761421204, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.015625, - "rewards/format_reward/std": 0.12414088100194931, - "rewards/tag_count_reward/mean": 0.77783203125, - "rewards/tag_count_reward/std": 0.25998979806900024, + "grad_norm": 14.147185325622559, + "kl": 0.8779296875, + "learning_rate": 9.376393322426065e-07, + "loss": 0.1716, + "num_tokens": 513703974.0, + "reward": 1.09765625, + "reward_std": 0.2624806761741638, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.18033012747764587, "step": 741 }, { @@ -21504,27 +21504,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 741.83203125, - "completions/mean_terminated_length": 683.187744140625, - "completions/min_length": 11.0, - "completions/min_terminated_length": 11.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1033.697265625, + "completions/mean_terminated_length": 977.23095703125, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, "epoch": 0.25330716053597335, - "grad_norm": 2.450438976287842, - "kl": 4.73828125, - "learning_rate": 9.371400137899642e-07, - "loss": 0.2917, - "num_tokens": 474723482.0, - "reward": 0.80810546875, - "reward_std": 0.33236464858055115, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.013671875, - "rewards/format_reward/std": 0.1162383034825325, - "rewards/tag_count_reward/mean": 0.75341796875, - "rewards/tag_count_reward/std": 0.27513211965560913, + "grad_norm": 10.49718189239502, + "kl": 1.2021484375, + "learning_rate": 9.373666687525603e-07, + "loss": 0.1751, + "num_tokens": 514318107.0, + "reward": 1.041015625, + "reward_std": 0.23247992992401123, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.1816815733909607, "step": 742 }, { @@ -21533,56 +21533,56 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1963.0, - "completions/mean_length": 681.587890625, - "completions/mean_terminated_length": 651.5868530273438, - "completions/min_length": 68.0, - "completions/min_terminated_length": 68.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1021.82421875, + "completions/mean_terminated_length": 973.5582885742188, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.25364854484936417, - "grad_norm": 5.370906352996826, - "kl": 4.0, - "learning_rate": 9.368664477371246e-07, - "loss": 0.2457, - "num_tokens": 475151095.0, - "reward": 0.84375, - "reward_std": 0.3205603063106537, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.798828125, - "rewards/tag_count_reward/std": 0.2556764483451843, - "step": 743 - }, - { - "clip_ratio/high_max": 0.0, + "grad_norm": 11.299504280090332, + "kl": 0.947265625, + "learning_rate": 9.370934550491547e-07, + "loss": 0.1465, + "num_tokens": 514919921.0, + "reward": 1.01904296875, + "reward_std": 0.16841323673725128, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95458984375, + "rewards/tag_count_reward/std": 0.15954138338565826, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 729.072265625, - "completions/mean_terminated_length": 681.01416015625, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1062.951171875, + "completions/mean_terminated_length": 999.4656982421875, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, "epoch": 0.253989929162755, - "grad_norm": 3.3952784538269043, - "kl": 3.8984375, - "learning_rate": 9.365923325981214e-07, - "loss": 0.2532, - "num_tokens": 475597372.0, - "reward": 0.810546875, - "reward_std": 0.31725451350212097, - "rewards/accuracy_reward/mean": 0.01953125, - "rewards/accuracy_reward/std": 0.1385180652141571, - "rewards/format_reward/mean": 0.013671875, - "rewards/format_reward/std": 0.1162383034825325, - "rewards/tag_count_reward/mean": 0.77734375, - "rewards/tag_count_reward/std": 0.26677456498146057, + "grad_norm": 11.003332138061523, + "kl": 1.568359375, + "learning_rate": 9.36819691520461e-07, + "loss": 0.136, + "num_tokens": 515537144.0, + "reward": 1.00341796875, + "reward_std": 0.23539161682128906, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.18454577028751373, "step": 744 }, { @@ -21591,27 +21591,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1975.0, - "completions/mean_length": 697.048828125, - "completions/mean_terminated_length": 650.6525268554688, - "completions/min_length": 14.0, - "completions/min_terminated_length": 14.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 963.91015625, + "completions/mean_terminated_length": 912.9202270507812, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, "epoch": 0.2543313134761458, - "grad_norm": 2.641296863555908, - "kl": 5.328125, - "learning_rate": 9.363176687620109e-07, - "loss": 0.3497, - "num_tokens": 476029461.0, - "reward": 0.8369140625, - "reward_std": 0.3765317499637604, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, - "rewards/tag_count_reward/mean": 0.7490234375, - "rewards/tag_count_reward/std": 0.268628865480423, + "grad_norm": 23.343652725219727, + "kl": 2.296875, + "learning_rate": 9.365453785553313e-07, + "loss": 0.1733, + "num_tokens": 516105866.0, + "reward": 1.12841796875, + "reward_std": 0.2690393924713135, + "rewards/accuracy_reward/mean": 0.177734375, + "rewards/accuracy_reward/std": 0.3826628625392914, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95068359375, + "rewards/tag_count_reward/std": 0.16443736851215363, "step": 745 }, { @@ -21620,27 +21620,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 715.10546875, - "completions/mean_terminated_length": 655.2611694335938, - "completions/min_length": 51.0, - "completions/min_terminated_length": 51.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 966.7421875, + "completions/mean_terminated_length": 911.2361450195312, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, "epoch": 0.25467269778953655, - "grad_norm": 2.496892213821411, - "kl": 5.859375, - "learning_rate": 9.360424566186279e-07, - "loss": 0.3758, - "num_tokens": 476473035.0, - "reward": 0.8193359375, - "reward_std": 0.3387095034122467, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, + "grad_norm": 18.213146209716797, + "kl": 3.28125, + "learning_rate": 9.36270516543398e-07, + "loss": 0.2117, + "num_tokens": 516678278.0, + "reward": 1.10595703125, + "reward_std": 0.25160735845565796, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.7626953125, - "rewards/tag_count_reward/std": 0.26878535747528076, + "rewards/tag_count_reward/mean": 0.95166015625, + "rewards/tag_count_reward/std": 0.16323591768741608, "step": 746 }, { @@ -21649,27 +21649,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 648.65234375, - "completions/mean_terminated_length": 603.5120849609375, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 916.38671875, + "completions/mean_terminated_length": 858.2957153320312, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.25501408210292736, - "grad_norm": 4.981993675231934, - "kl": 5.578125, - "learning_rate": 9.357666965585859e-07, - "loss": 0.3321, - "num_tokens": 476886873.0, - "reward": 0.84375, - "reward_std": 0.35605841875076294, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.021484375, - "rewards/format_reward/std": 0.14513419568538666, - "rewards/tag_count_reward/mean": 0.775390625, - "rewards/tag_count_reward/std": 0.27285876870155334, + "grad_norm": 4412.37353515625, + "kl": 77.189453125, + "learning_rate": 9.359951058750738e-07, + "loss": 3.1958, + "num_tokens": 517229196.0, + "reward": 1.0654296875, + "reward_std": 0.2353786826133728, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9482421875, + "rewards/tag_count_reward/std": 0.16330981254577637, "step": 747 }, { @@ -21678,27 +21678,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 791.423828125, - "completions/mean_terminated_length": 724.1995849609375, - "completions/min_length": 8.0, - "completions/min_terminated_length": 8.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1036.7421875, + "completions/mean_terminated_length": 978.2396240234375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, "epoch": 0.2553554664163182, - "grad_norm": 1.5774364471435547, - "kl": 5.890625, - "learning_rate": 9.354903889732761e-07, - "loss": 0.4023, - "num_tokens": 477365858.0, - "reward": 0.8251953125, - "reward_std": 0.3529778718948364, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.013671875, - "rewards/format_reward/std": 0.1162383034825325, - "rewards/tag_count_reward/mean": 0.7646484375, - "rewards/tag_count_reward/std": 0.2645571529865265, + "grad_norm": 18.636999130249023, + "kl": 2.818359375, + "learning_rate": 9.357191469415501e-07, + "loss": 0.2048, + "num_tokens": 517833784.0, + "reward": 1.09326171875, + "reward_std": 0.23580974340438843, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94677734375, + "rewards/tag_count_reward/std": 0.16321250796318054, "step": 748 }, { @@ -21707,27 +21707,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1902.0, - "completions/mean_length": 682.677734375, - "completions/mean_terminated_length": 661.0059814453125, - "completions/min_length": 14.0, - "completions/min_terminated_length": 14.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1055.8515625, + "completions/mean_terminated_length": 1000.6185913085938, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, "epoch": 0.255696850729709, - "grad_norm": 2.7102158069610596, - "kl": 3.68359375, - "learning_rate": 9.352135342548659e-07, - "loss": 0.1737, - "num_tokens": 477801197.0, - "reward": 0.84228515625, - "reward_std": 0.3041990101337433, - "rewards/accuracy_reward/mean": 0.021484375, - "rewards/accuracy_reward/std": 0.14513419568538666, - "rewards/format_reward/mean": 0.01171875, - "rewards/format_reward/std": 0.10772226005792618, - "rewards/tag_count_reward/mean": 0.80908203125, - "rewards/tag_count_reward/std": 0.25034329295158386, + "grad_norm": 72.12713623046875, + "kl": 4.38671875, + "learning_rate": 9.354426401347974e-07, + "loss": 0.2717, + "num_tokens": 518460188.0, + "reward": 0.9765625, + "reward_std": 0.1754680871963501, + "rewards/accuracy_reward/mean": 0.037109375, + "rewards/accuracy_reward/std": 0.18921469151973724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.939453125, + "rewards/tag_count_reward/std": 0.1810387820005417, "step": 749 }, { @@ -21736,27 +21736,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1904.0, - "completions/mean_length": 707.30859375, - "completions/mean_terminated_length": 652.8088989257812, - "completions/min_length": 13.0, - "completions/min_terminated_length": 13.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 931.974609375, + "completions/mean_terminated_length": 872.26953125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, "epoch": 0.25603823504309975, - "grad_norm": 6.120353698730469, - "kl": 5.9296875, - "learning_rate": 9.349361327963006e-07, - "loss": 0.3421, - "num_tokens": 478238651.0, - "reward": 0.84130859375, - "reward_std": 0.33035802841186523, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.77490234375, - "rewards/tag_count_reward/std": 0.27223074436187744, + "grad_norm": 993.8719482421875, + "kl": 26.9140625, + "learning_rate": 9.351655858475646e-07, + "loss": 1.1859, + "num_tokens": 519012671.0, + "reward": 1.03271484375, + "reward_std": 0.22397825121879578, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.17503775656223297, "step": 750 }, { @@ -21765,27 +21765,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 664.611328125, - "completions/mean_terminated_length": 639.8588256835938, - "completions/min_length": 18.0, - "completions/min_terminated_length": 18.0, + "completions/max_terminated_length": 1939.0, + "completions/mean_length": 940.498046875, + "completions/mean_terminated_length": 918.436279296875, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, "epoch": 0.25637961935649056, - "grad_norm": 1.5641052722930908, - "kl": 3.7265625, - "learning_rate": 9.346581849913004e-07, - "loss": 0.1939, - "num_tokens": 478658340.0, - "reward": 0.91552734375, - "reward_std": 0.3789862394332886, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.81787109375, - "rewards/tag_count_reward/std": 0.2553880512714386, + "grad_norm": 4.655689716339111, + "kl": 1.06103515625, + "learning_rate": 9.348879844733779e-07, + "loss": 0.0985, + "num_tokens": 519573614.0, + "reward": 1.09033203125, + "reward_std": 0.20516103506088257, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.97119140625, + "rewards/tag_count_reward/std": 0.11714621633291245, "step": 751 }, { @@ -21794,27 +21794,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 725.490234375, - "completions/mean_terminated_length": 685.575439453125, - "completions/min_length": 27.0, - "completions/min_terminated_length": 27.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 968.875, + "completions/mean_terminated_length": 931.814208984375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.2567210036698814, - "grad_norm": 1.9023103713989258, - "kl": 4.8203125, - "learning_rate": 9.343796912343617e-07, - "loss": 0.2562, - "num_tokens": 479109839.0, - "reward": 0.83349609375, - "reward_std": 0.3289315700531006, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.78271484375, - "rewards/tag_count_reward/std": 0.264554888010025, + "grad_norm": 13.530494689941406, + "kl": 2.39453125, + "learning_rate": 9.346098364065405e-07, + "loss": 0.1711, + "num_tokens": 520149726.0, + "reward": 1.046875, + "reward_std": 0.19065305590629578, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.958984375, + "rewards/tag_count_reward/std": 0.14764074981212616, "step": 752 }, { @@ -21823,27 +21823,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 719.732421875, - "completions/mean_terminated_length": 682.3915405273438, - "completions/min_length": 18.0, - "completions/min_terminated_length": 18.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1013.578125, + "completions/mean_terminated_length": 967.1346435546875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, "epoch": 0.2570623879832722, - "grad_norm": 4.725130081176758, - "kl": 5.1640625, - "learning_rate": 9.341006519207551e-07, - "loss": 0.2603, - "num_tokens": 479555462.0, - "reward": 0.84033203125, - "reward_std": 0.33210453391075134, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.01171875, - "rewards/format_reward/std": 0.10772226005792618, - "rewards/tag_count_reward/mean": 0.78369140625, - "rewards/tag_count_reward/std": 0.25549277663230896, + "grad_norm": 5.790446758270264, + "kl": 1.2001953125, + "learning_rate": 9.343311420421323e-07, + "loss": 0.1263, + "num_tokens": 520745798.0, + "reward": 1.02978515625, + "reward_std": 0.2289455235004425, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95947265625, + "rewards/tag_count_reward/std": 0.14315126836299896, "step": 753 }, { @@ -21852,27 +21852,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 731.30859375, - "completions/mean_terminated_length": 697.0060424804688, - "completions/min_length": 99.0, - "completions/min_terminated_length": 99.0, + "completions/max_terminated_length": 1950.0, + "completions/mean_length": 1003.39453125, + "completions/mean_terminated_length": 963.1358642578125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, "epoch": 0.25740377229666295, - "grad_norm": 3.288994550704956, - "kl": 3.8671875, - "learning_rate": 9.338210674465263e-07, - "loss": 0.2331, - "num_tokens": 480010244.0, - "reward": 0.861328125, - "reward_std": 0.31858742237091064, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, - "rewards/tag_count_reward/mean": 0.8046875, - "rewards/tag_count_reward/std": 0.2491423636674881, + "grad_norm": 7.176300048828125, + "kl": 0.923828125, + "learning_rate": 9.340519017760093e-07, + "loss": 0.1112, + "num_tokens": 521339888.0, + "reward": 1.0390625, + "reward_std": 0.21267008781433105, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.15950019657611847, "step": 754 }, { @@ -21881,27 +21881,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 694.751953125, - "completions/mean_terminated_length": 675.9940795898438, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 949.740234375, + "completions/mean_terminated_length": 925.6267700195312, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.25774515661005376, - "grad_norm": 2.6497554779052734, - "kl": 4.734375, - "learning_rate": 9.335409382084939e-07, - "loss": 0.2692, - "num_tokens": 480439381.0, - "reward": 0.87548828125, - "reward_std": 0.32045841217041016, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.80517578125, - "rewards/tag_count_reward/std": 0.24878878891468048, + "grad_norm": 6.401724815368652, + "kl": 0.6787109375, + "learning_rate": 9.337721160048028e-07, + "loss": 0.096, + "num_tokens": 521899579.0, + "reward": 1.1005859375, + "reward_std": 0.2047678828239441, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9658203125, + "rewards/tag_count_reward/std": 0.13105663657188416, "step": 755 }, { @@ -21910,27 +21910,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 720.1015625, - "completions/mean_terminated_length": 666.1219482421875, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 965.056640625, + "completions/mean_terminated_length": 925.59716796875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, "epoch": 0.2580865409234446, - "grad_norm": 4.2875871658325195, - "kl": 5.5234375, - "learning_rate": 9.332602646042504e-07, - "loss": 0.3288, - "num_tokens": 480884121.0, - "reward": 0.87646484375, - "reward_std": 0.3499682545661926, - "rewards/accuracy_reward/mean": 0.07459677755832672, - "rewards/accuracy_reward/std": 0.263004869222641, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.79638671875, - "rewards/tag_count_reward/std": 0.2582714557647705, + "grad_norm": 13.514364242553711, + "kl": 0.994140625, + "learning_rate": 9.334917851259187e-07, + "loss": 0.115, + "num_tokens": 522469736.0, + "reward": 1.1015625, + "reward_std": 0.2454531192779541, + "rewards/accuracy_reward/mean": 0.14919355511665344, + "rewards/accuracy_reward/std": 0.3566388487815857, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.15037257969379425, "step": 756 }, { @@ -21939,27 +21939,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 739.109375, - "completions/mean_terminated_length": 720.9663696289062, - "completions/min_length": 14.0, - "completions/min_terminated_length": 14.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1073.171875, + "completions/mean_terminated_length": 1010.3450927734375, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, "epoch": 0.2584279252368354, - "grad_norm": 1.863918423652649, - "kl": 3.87890625, - "learning_rate": 9.329790470321607e-07, - "loss": 0.1861, - "num_tokens": 481341153.0, - "reward": 0.84033203125, - "reward_std": 0.3457339107990265, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.78564453125, - "rewards/tag_count_reward/std": 0.26876533031463623, + "grad_norm": 8.269085884094238, + "kl": 1.2099609375, + "learning_rate": 9.332109095375376e-07, + "loss": 0.1442, + "num_tokens": 523097808.0, + "reward": 1.03564453125, + "reward_std": 0.23710045218467712, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94384765625, + "rewards/tag_count_reward/std": 0.1794104278087616, "step": 757 }, { @@ -21968,27 +21968,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1997.0, - "completions/mean_length": 766.458984375, - "completions/mean_terminated_length": 743.52880859375, - "completions/min_length": 57.0, - "completions/min_terminated_length": 57.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1081.7734375, + "completions/mean_terminated_length": 1030.082275390625, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, "epoch": 0.25876930955022615, - "grad_norm": 1.599083423614502, - "kl": 4.2109375, - "learning_rate": 9.326972858913613e-07, - "loss": 0.2422, - "num_tokens": 481813196.0, - "reward": 0.89599609375, - "reward_std": 0.3567178547382355, - "rewards/accuracy_reward/mean": 0.08266129344701767, - "rewards/accuracy_reward/std": 0.2756476104259491, - "rewards/format_reward/mean": 0.01171875, - "rewards/format_reward/std": 0.10772226005792618, - "rewards/tag_count_reward/mean": 0.80419921875, - "rewards/tag_count_reward/std": 0.24949447810649872, + "grad_norm": 4.554450511932373, + "kl": 0.94140625, + "learning_rate": 9.329294896386131e-07, + "loss": 0.0992, + "num_tokens": 523731292.0, + "reward": 1.123046875, + "reward_std": 0.26751840114593506, + "rewards/accuracy_reward/mean": 0.17741934955120087, + "rewards/accuracy_reward/std": 0.38240891695022583, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.951171875, + "rewards/tag_count_reward/std": 0.1612044721841812, "step": 758 }, { @@ -21997,27 +21997,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1809.0, - "completions/mean_length": 669.76953125, - "completions/mean_terminated_length": 656.1775512695312, - "completions/min_length": 16.0, - "completions/min_terminated_length": 16.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 948.291015625, + "completions/mean_terminated_length": 915.1005859375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.25911069386361696, - "grad_norm": 2.088303565979004, - "kl": 4.078125, - "learning_rate": 9.324149815817612e-07, - "loss": 0.1918, - "num_tokens": 482228918.0, - "reward": 0.888671875, - "reward_std": 0.3486194908618927, - "rewards/accuracy_reward/mean": 0.07258064299821854, - "rewards/accuracy_reward/std": 0.25970885157585144, - "rewards/format_reward/mean": 0.01171875, - "rewards/format_reward/std": 0.10772226005792618, - "rewards/tag_count_reward/mean": 0.806640625, - "rewards/tag_count_reward/std": 0.25740471482276917, + "grad_norm": 6.214273929595947, + "kl": 1.2587890625, + "learning_rate": 9.326475258288729e-07, + "loss": 0.1482, + "num_tokens": 524289617.0, + "reward": 1.078125, + "reward_std": 0.2043112814426422, + "rewards/accuracy_reward/mean": 0.11491935700178146, + "rewards/accuracy_reward/std": 0.3192465901374817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.966796875, + "rewards/tag_count_reward/std": 0.1284831464290619, "step": 759 }, { @@ -22026,27 +22026,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1974.0, - "completions/mean_length": 714.9921875, - "completions/mean_terminated_length": 683.0000610351562, - "completions/min_length": 17.0, - "completions/min_terminated_length": 17.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1032.13671875, + "completions/mean_terminated_length": 975.5835571289062, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, "epoch": 0.25945207817700777, - "grad_norm": 1.6705163717269897, - "kl": 4.390625, - "learning_rate": 9.321321345040391e-07, - "loss": 0.2625, - "num_tokens": 482675394.0, - "reward": 0.84375, - "reward_std": 0.3496783375740051, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.783203125, - "rewards/tag_count_reward/std": 0.26240473985671997, + "grad_norm": 6.642508029937744, + "kl": 1.2841796875, + "learning_rate": 9.323650185088164e-07, + "loss": 0.181, + "num_tokens": 524898471.0, + "reward": 1.07080078125, + "reward_std": 0.27675095200538635, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94775390625, + "rewards/tag_count_reward/std": 0.16868269443511963, "step": 760 }, { @@ -22055,27 +22055,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1916.0, - "completions/mean_length": 723.802734375, - "completions/mean_terminated_length": 702.7837524414062, - "completions/min_length": 80.0, - "completions/min_terminated_length": 80.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1034.298828125, + "completions/mean_terminated_length": 984.4446411132812, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.2597934624903986, - "grad_norm": 6.62919807434082, - "kl": 3.78515625, - "learning_rate": 9.31848745059645e-07, - "loss": 0.2394, - "num_tokens": 483122077.0, - "reward": 0.892578125, - "reward_std": 0.3438825309276581, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, - "rewards/tag_count_reward/mean": 0.798828125, - "rewards/tag_count_reward/std": 0.2551976144313812, + "grad_norm": 9.08815860748291, + "kl": 1.4560546875, + "learning_rate": 9.320819680797154e-07, + "loss": 0.1458, + "num_tokens": 525504128.0, + "reward": 1.0888671875, + "reward_std": 0.20690321922302246, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9501953125, + "rewards/tag_count_reward/std": 0.16466166079044342, "step": 761 }, { @@ -22084,27 +22084,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 713.390625, - "completions/mean_terminated_length": 686.8048095703125, - "completions/min_length": 21.0, - "completions/min_terminated_length": 21.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 1024.84375, + "completions/mean_terminated_length": 940.4819946289062, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, "epoch": 0.26013484680378934, - "grad_norm": 7.827740669250488, - "kl": 4.15234375, - "learning_rate": 9.315648136507987e-07, - "loss": 0.2751, - "num_tokens": 483560965.0, - "reward": 0.81103515625, - "reward_std": 0.3110986053943634, - "rewards/accuracy_reward/mean": 0.017578125, - "rewards/accuracy_reward/std": 0.13154059648513794, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, - "rewards/tag_count_reward/mean": 0.78369140625, - "rewards/tag_count_reward/std": 0.26443204283714294, + "grad_norm": 12.516509056091309, + "kl": 2.51953125, + "learning_rate": 9.317983749436133e-07, + "loss": 0.2286, + "num_tokens": 526102480.0, + "reward": 1.015625, + "reward_std": 0.27147382497787476, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.2059757262468338, "step": 762 }, { @@ -22113,27 +22113,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 718.63671875, - "completions/mean_terminated_length": 670.1984252929688, - "completions/min_length": 22.0, - "completions/min_terminated_length": 22.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 954.794921875, + "completions/mean_terminated_length": 919.5302124023438, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.26047623111718016, - "grad_norm": 3.6159653663635254, - "kl": 5.1484375, - "learning_rate": 9.312803406804882e-07, - "loss": 0.2669, - "num_tokens": 484009371.0, - "reward": 0.833984375, - "reward_std": 0.33764201402664185, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.01171875, - "rewards/format_reward/std": 0.10772226005792618, - "rewards/tag_count_reward/mean": 0.771484375, - "rewards/tag_count_reward/std": 0.2691352665424347, + "grad_norm": 9.569135665893555, + "kl": 1.4814453125, + "learning_rate": 9.315142395033239e-07, + "loss": 0.1629, + "num_tokens": 526671799.0, + "reward": 1.0927734375, + "reward_std": 0.203653484582901, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9677734375, + "rewards/tag_count_reward/std": 0.13061843812465668, "step": 763 }, { @@ -22142,27 +22142,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 700.111328125, - "completions/mean_terminated_length": 675.9940185546875, - "completions/min_length": 9.0, - "completions/min_terminated_length": 9.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 989.978515625, + "completions/mean_terminated_length": 960.2349243164062, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, "epoch": 0.26081761543057097, - "grad_norm": 2.015320301055908, - "kl": 4.87109375, - "learning_rate": 9.309953265524714e-07, - "loss": 0.2861, - "num_tokens": 484439188.0, - "reward": 0.84326171875, - "reward_std": 0.3085383474826813, - "rewards/accuracy_reward/mean": 0.025390625, - "rewards/accuracy_reward/std": 0.15746226906776428, - "rewards/format_reward/mean": 0.015625, - "rewards/format_reward/std": 0.12414088100194931, - "rewards/tag_count_reward/mean": 0.80224609375, - "rewards/tag_count_reward/std": 0.25619494915008545, + "grad_norm": 3.776266098022461, + "kl": 1.2421875, + "learning_rate": 9.312295621624317e-07, + "loss": 0.1527, + "num_tokens": 527250028.0, + "reward": 1.03271484375, + "reward_std": 0.17305736243724823, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95849609375, + "rewards/tag_count_reward/std": 0.15199612081050873, "step": 764 }, { @@ -22171,27 +22171,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 726.439453125, - "completions/mean_terminated_length": 716.033447265625, - "completions/min_length": 11.0, - "completions/min_terminated_length": 11.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1071.771484375, + "completions/mean_terminated_length": 1021.6571044921875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, "epoch": 0.2611589997439618, - "grad_norm": 2.472776174545288, - "kl": 4.51953125, - "learning_rate": 9.307097716712735e-07, - "loss": 0.2319, - "num_tokens": 484882725.0, - "reward": 0.822265625, - "reward_std": 0.2942245602607727, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.791015625, - "rewards/tag_count_reward/std": 0.25847160816192627, + "grad_norm": 22.29033088684082, + "kl": 2.4140625, + "learning_rate": 9.309443433252904e-07, + "loss": 0.1926, + "num_tokens": 527870375.0, + "reward": 1.00537109375, + "reward_std": 0.19676750898361206, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94873046875, + "rewards/tag_count_reward/std": 0.16970491409301758, "step": 765 }, { @@ -22200,27 +22200,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1962.0, - "completions/mean_length": 738.49609375, - "completions/mean_terminated_length": 707.0680541992188, - "completions/min_length": 53.0, - "completions/min_terminated_length": 53.0, + "completions/max_terminated_length": 1953.0, + "completions/mean_length": 1038.96484375, + "completions/mean_terminated_length": 960.3662719726562, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, "epoch": 0.26150038405735254, - "grad_norm": 1.8773621320724487, - "kl": 5.1875, - "learning_rate": 9.304236764421876e-07, - "loss": 0.3341, - "num_tokens": 485344787.0, - "reward": 0.8388671875, - "reward_std": 0.36534199118614197, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.7646484375, - "rewards/tag_count_reward/std": 0.27852046489715576, + "grad_norm": 8.716938018798828, + "kl": 3.26171875, + "learning_rate": 9.306585833970237e-07, + "loss": 0.2634, + "num_tokens": 528486277.0, + "reward": 1.08203125, + "reward_std": 0.2926439940929413, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.2005603015422821, "step": 766 }, { @@ -22229,27 +22229,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 719.046875, - "completions/mean_terminated_length": 689.8682861328125, - "completions/min_length": 31.0, - "completions/min_terminated_length": 31.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1021.5859375, + "completions/mean_terminated_length": 984.186279296875, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, "epoch": 0.26184176837074336, - "grad_norm": 3.4017436504364014, - "kl": 4.6015625, - "learning_rate": 9.301370412712733e-07, - "loss": 0.2944, - "num_tokens": 485790027.0, - "reward": 0.88427734375, - "reward_std": 0.32861924171447754, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.81005859375, - "rewards/tag_count_reward/std": 0.25399237871170044, + "grad_norm": 7.548423767089844, + "kl": 2.515625, + "learning_rate": 9.303722827835229e-07, + "loss": 0.1813, + "num_tokens": 529086417.0, + "reward": 1.1357421875, + "reward_std": 0.23893500864505768, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38430243730545044, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9560546875, + "rewards/tag_count_reward/std": 0.1408405750989914, "step": 767 }, { @@ -22258,27 +22258,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 728.86328125, - "completions/mean_terminated_length": 707.9246215820312, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1050.638671875, + "completions/mean_terminated_length": 1018.4656982421875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.26218315268413417, - "grad_norm": 5.394140720367432, - "kl": 4.59375, - "learning_rate": 9.29849866565357e-07, - "loss": 0.3168, - "num_tokens": 486238709.0, - "reward": 0.8681640625, - "reward_std": 0.3399982750415802, - "rewards/accuracy_reward/mean": 0.060483869165182114, - "rewards/accuracy_reward/std": 0.2386218160390854, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.8056640625, - "rewards/tag_count_reward/std": 0.2576180696487427, + "grad_norm": 26.18227195739746, + "kl": 2.44091796875, + "learning_rate": 9.30085441891448e-07, + "loss": 0.1607, + "num_tokens": 529699848.0, + "reward": 1.07275390625, + "reward_std": 0.24135644733905792, + "rewards/accuracy_reward/mean": 0.12298387289047241, + "rewards/accuracy_reward/std": 0.32875028252601624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95361328125, + "rewards/tag_count_reward/std": 0.15299242734909058, "step": 768 }, { @@ -22287,27 +22287,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1964.0, - "completions/mean_length": 689.662109375, - "completions/mean_terminated_length": 668.1012573242188, - "completions/min_length": 47.0, - "completions/min_terminated_length": 47.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 967.138671875, + "completions/mean_terminated_length": 943.4071655273438, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.262524536997525, - "grad_norm": 14.306845664978027, - "kl": 6.515625, - "learning_rate": 9.295621527320305e-07, - "loss": 0.3435, - "num_tokens": 486661656.0, - "reward": 0.82373046875, - "reward_std": 0.3080504536628723, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.78857421875, - "rewards/tag_count_reward/std": 0.26329857110977173, + "grad_norm": 8.040897369384766, + "kl": 1.3896484375, + "learning_rate": 9.297980611282259e-07, + "loss": 0.1224, + "num_tokens": 530264863.0, + "reward": 1.02978515625, + "reward_std": 0.14422252774238586, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.96728515625, + "rewards/tag_count_reward/std": 0.11923210322856903, "step": 769 }, { @@ -22316,27 +22316,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1986.0, - "completions/mean_length": 808.90234375, - "completions/mean_terminated_length": 761.1480712890625, - "completions/min_length": 31.0, - "completions/min_terminated_length": 31.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1104.115234375, + "completions/mean_terminated_length": 1059.7198486328125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, "epoch": 0.26286592131091574, - "grad_norm": 12.381365776062012, - "kl": 7.7734375, - "learning_rate": 9.292739001796513e-07, - "loss": 0.4548, - "num_tokens": 487157414.0, - "reward": 0.82861328125, - "reward_std": 0.33342158794403076, - "rewards/accuracy_reward/mean": 0.04233871027827263, - "rewards/accuracy_reward/std": 0.2015640139579773, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.77978515625, - "rewards/tag_count_reward/std": 0.2685697376728058, + "grad_norm": 3.894073486328125, + "kl": 1.767578125, + "learning_rate": 9.295101409020507e-07, + "loss": 0.1473, + "num_tokens": 530911770.0, + "reward": 1.048828125, + "reward_std": 0.19878928363323212, + "rewards/accuracy_reward/mean": 0.09072580933570862, + "rewards/accuracy_reward/std": 0.2875087857246399, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1396721750497818, "step": 770 }, { @@ -22345,27 +22345,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 741.685546875, - "completions/mean_terminated_length": 710.3340454101562, - "completions/min_length": 46.0, - "completions/min_terminated_length": 46.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1033.3046875, + "completions/mean_terminated_length": 1011.0259399414062, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, "epoch": 0.26320730562430655, - "grad_norm": 7.73433256149292, - "kl": 7.0625, - "learning_rate": 9.289851093173408e-07, - "loss": 0.4275, - "num_tokens": 487621301.0, - "reward": 0.86376953125, - "reward_std": 0.34932851791381836, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.77783203125, - "rewards/tag_count_reward/std": 0.27014127373695374, + "grad_norm": 1.7520126104354858, + "kl": 0.48583984375, + "learning_rate": 9.292216816218826e-07, + "loss": 0.0527, + "num_tokens": 531524966.0, + "reward": 1.154296875, + "reward_std": 0.21976973116397858, + "rewards/accuracy_reward/mean": 0.18359375, + "rewards/accuracy_reward/std": 0.3875311613082886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.970703125, + "rewards/tag_count_reward/std": 0.12163712829351425, "step": 771 }, { @@ -22374,27 +22374,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1960.0, - "completions/mean_length": 734.646484375, - "completions/mean_terminated_length": 689.5414428710938, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1003.591796875, + "completions/mean_terminated_length": 982.786865234375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, "epoch": 0.26354868993769737, - "grad_norm": 1.9583275318145752, - "kl": 5.8359375, - "learning_rate": 9.286957805549849e-07, - "loss": 0.3714, - "num_tokens": 488070240.0, - "reward": 0.84033203125, - "reward_std": 0.3109779357910156, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.80126953125, - "rewards/tag_count_reward/std": 0.26018086075782776, + "grad_norm": 3.8183629512786865, + "kl": 0.82421875, + "learning_rate": 9.289326836974474e-07, + "loss": 0.101, + "num_tokens": 532111605.0, + "reward": 1.04296875, + "reward_std": 0.18722647428512573, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.12506113946437836, "step": 772 }, { @@ -22403,27 +22403,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 741.8125, - "completions/mean_terminated_length": 705.0923461914062, - "completions/min_length": 84.0, - "completions/min_terminated_length": 84.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 1022.734375, + "completions/mean_terminated_length": 1006.4603881835938, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, "epoch": 0.2638900742510882, - "grad_norm": 5.635120868682861, - "kl": 4.5234375, - "learning_rate": 9.284059143032329e-07, - "loss": 0.3181, - "num_tokens": 488524656.0, - "reward": 0.83935546875, - "reward_std": 0.28641611337661743, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, + "grad_norm": 1.6211934089660645, + "kl": 0.93359375, + "learning_rate": 9.286431475392363e-07, + "loss": 0.0695, + "num_tokens": 532709853.0, + "reward": 1.02685546875, + "reward_std": 0.1864224076271057, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.81005859375, - "rewards/tag_count_reward/std": 0.25351038575172424, + "rewards/tag_count_reward/mean": 0.97216796875, + "rewards/tag_count_reward/std": 0.11527974903583527, "step": 773 }, { @@ -22432,27 +22432,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1923.0, - "completions/mean_length": 771.77734375, - "completions/mean_terminated_length": 714.4775390625, - "completions/min_length": 77.0, - "completions/min_terminated_length": 77.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 991.841796875, + "completions/mean_terminated_length": 968.6527099609375, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, "epoch": 0.26423145856447894, - "grad_norm": 4.567211627960205, - "kl": 4.578125, - "learning_rate": 9.281155109734971e-07, - "loss": 0.37, - "num_tokens": 488997774.0, - "reward": 0.908203125, - "reward_std": 0.3757338225841522, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.806640625, - "rewards/tag_count_reward/std": 0.2635094225406647, + "grad_norm": 3.4774885177612305, + "kl": 1.0576171875, + "learning_rate": 9.283530735585044e-07, + "loss": 0.094, + "num_tokens": 533295644.0, + "reward": 1.19580078125, + "reward_std": 0.2450578808784485, + "rewards/accuracy_reward/mean": 0.220703125, + "rewards/accuracy_reward/std": 0.4151262938976288, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.97509765625, + "rewards/tag_count_reward/std": 0.10717868059873581, "step": 774 }, { @@ -22461,27 +22461,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1940.0, - "completions/mean_length": 739.1953125, - "completions/mean_terminated_length": 723.6759033203125, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 1026.12109375, + "completions/mean_terminated_length": 1020.0982666015625, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, "epoch": 0.26457284287786975, - "grad_norm": 4.73957633972168, - "kl": 4.078125, - "learning_rate": 9.278245709779515e-07, - "loss": 0.2605, - "num_tokens": 489447586.0, - "reward": 0.9208984375, - "reward_std": 0.3388001322746277, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, + "grad_norm": 1.5099610090255737, + "kl": 0.52685546875, + "learning_rate": 9.280624621672716e-07, + "loss": 0.0325, + "num_tokens": 533892362.0, + "reward": 1.1474609375, + "reward_std": 0.20763105154037476, + "rewards/accuracy_reward/mean": 0.169921875, + "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8232421875, - "rewards/tag_count_reward/std": 0.24331942200660706, + "rewards/tag_count_reward/mean": 0.9775390625, + "rewards/tag_count_reward/std": 0.09505070745944977, "step": 775 }, { @@ -22490,27 +22490,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 765.951171875, - "completions/mean_terminated_length": 713.8353271484375, - "completions/min_length": 101.0, - "completions/min_terminated_length": 101.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1008.173828125, + "completions/mean_terminated_length": 978.9417114257812, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, "epoch": 0.26491422719126057, - "grad_norm": 2.601271390914917, - "kl": 5.890625, - "learning_rate": 9.275330947295326e-07, - "loss": 0.4168, - "num_tokens": 489914457.0, - "reward": 0.85888671875, - "reward_std": 0.2817559242248535, - "rewards/accuracy_reward/mean": 0.025390625, - "rewards/accuracy_reward/std": 0.15746226906776428, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.82568359375, - "rewards/tag_count_reward/std": 0.2361827939748764, + "grad_norm": 5.090696334838867, + "kl": 1.26708984375, + "learning_rate": 9.2777131377832e-07, + "loss": 0.132, + "num_tokens": 534483251.0, + "reward": 1.0244140625, + "reward_std": 0.19809219241142273, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9619140625, + "rewards/tag_count_reward/std": 0.13994215428829193, "step": 776 }, { @@ -22519,27 +22519,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 741.03515625, - "completions/mean_terminated_length": 709.6680297851562, - "completions/min_length": 68.0, - "completions/min_terminated_length": 68.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1014.541015625, + "completions/mean_terminated_length": 985.4879150390625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.2652556115046514, - "grad_norm": 6.369903564453125, - "kl": 7.1171875, - "learning_rate": 9.272410826419374e-07, - "loss": 0.4342, - "num_tokens": 490373355.0, - "reward": 0.8427734375, - "reward_std": 0.29890674352645874, - "rewards/accuracy_reward/mean": 0.021484375, - "rewards/accuracy_reward/std": 0.14513419568538666, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.8173828125, - "rewards/tag_count_reward/std": 0.2562345564365387, + "grad_norm": 4.278371334075928, + "kl": 0.99267578125, + "learning_rate": 9.274796288051956e-07, + "loss": 0.1079, + "num_tokens": 535082184.0, + "reward": 1.037109375, + "reward_std": 0.21481657028198242, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.962890625, + "rewards/tag_count_reward/std": 0.1393296867609024, "step": 777 }, { @@ -22548,27 +22548,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 734.990234375, - "completions/mean_terminated_length": 706.1616821289062, - "completions/min_length": 10.0, - "completions/min_terminated_length": 10.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1022.33984375, + "completions/mean_terminated_length": 1010.1779174804688, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, "epoch": 0.26559699581804214, - "grad_norm": 11.346620559692383, - "kl": 7.3046875, - "learning_rate": 9.269485351296239e-07, - "loss": 0.3906, - "num_tokens": 490827590.0, - "reward": 0.80712890625, - "reward_std": 0.29944953322410583, - "rewards/accuracy_reward/mean": 0.021484375, - "rewards/accuracy_reward/std": 0.14513419568538666, + "grad_norm": 1.9273333549499512, + "kl": 0.560546875, + "learning_rate": 9.271874076622057e-07, + "loss": 0.0594, + "num_tokens": 535683542.0, + "reward": 1.046875, + "reward_std": 0.201980859041214, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.78564453125, - "rewards/tag_count_reward/std": 0.26830989122390747, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.10367217659950256, "step": 778 }, { @@ -22577,27 +22577,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 719.546875, - "completions/mean_terminated_length": 679.4526977539062, - "completions/min_length": 21.0, - "completions/min_terminated_length": 21.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 980.490234375, + "completions/mean_terminated_length": 941.5931396484375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.26593838013143295, - "grad_norm": 9.743428230285645, - "kl": 7.1875, - "learning_rate": 9.266554526078095e-07, - "loss": 0.4057, - "num_tokens": 491274734.0, - "reward": 0.92529296875, - "reward_std": 0.3917638063430786, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, - "rewards/tag_count_reward/mean": 0.80810546875, - "rewards/tag_count_reward/std": 0.2510598301887512, + "grad_norm": 2.6369338035583496, + "kl": 1.66015625, + "learning_rate": 9.268946507644197e-07, + "loss": 0.1327, + "num_tokens": 536264289.0, + "reward": 1.13720703125, + "reward_std": 0.2692152261734009, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38430243730545044, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95751953125, + "rewards/tag_count_reward/std": 0.149287611246109, "step": 779 }, { @@ -22606,27 +22606,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1753.0, - "completions/mean_length": 673.9921875, - "completions/mean_terminated_length": 649.4075317382812, - "completions/min_length": 35.0, - "completions/min_terminated_length": 35.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 933.021484375, + "completions/mean_terminated_length": 924.2421264648438, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, "epoch": 0.26627976444482376, - "grad_norm": 3.9403653144836426, - "kl": 5.1328125, - "learning_rate": 9.263618354924714e-07, - "loss": 0.275, - "num_tokens": 491692586.0, - "reward": 0.8759765625, - "reward_std": 0.2943779528141022, - "rewards/accuracy_reward/mean": 0.04032257944345474, - "rewards/accuracy_reward/std": 0.19691328704357147, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.8349609375, - "rewards/tag_count_reward/std": 0.2504563331604004, + "grad_norm": 2.2521584033966064, + "kl": 0.8876953125, + "learning_rate": 9.266013585276678e-07, + "loss": 0.0929, + "num_tokens": 536814764.0, + "reward": 1.10400390625, + "reward_std": 0.2229701280593872, + "rewards/accuracy_reward/mean": 0.13306452333927155, + "rewards/accuracy_reward/std": 0.3399873375892639, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.97509765625, + "rewards/tag_count_reward/std": 0.1060313731431961, "step": 780 }, { @@ -22635,27 +22635,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1956.0, - "completions/mean_length": 742.46484375, - "completions/mean_terminated_length": 716.4581909179688, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1002.607421875, + "completions/mean_terminated_length": 977.51806640625, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.2666211487582146, - "grad_norm": 3.920292854309082, - "kl": 4.37890625, - "learning_rate": 9.260676842003453e-07, - "loss": 0.2484, - "num_tokens": 492146296.0, - "reward": 0.9189453125, - "reward_std": 0.314945787191391, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.8330078125, - "rewards/tag_count_reward/std": 0.2421857714653015, + "grad_norm": 2.152036428451538, + "kl": 1.15234375, + "learning_rate": 9.263075313685405e-07, + "loss": 0.1356, + "num_tokens": 537401667.0, + "reward": 1.08251953125, + "reward_std": 0.17819756269454956, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.96923828125, + "rewards/tag_count_reward/std": 0.12475418299436569, "step": 781 }, { @@ -22664,27 +22664,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 750.123046875, - "completions/mean_terminated_length": 702.83203125, - "completions/min_length": 67.0, - "completions/min_terminated_length": 67.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1007.265625, + "completions/mean_terminated_length": 984.4151611328125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, "epoch": 0.26696253307160533, - "grad_norm": 8.555292129516602, - "kl": 3.08984375, - "learning_rate": 9.257729991489252e-07, - "loss": 0.2254, - "num_tokens": 492605319.0, - "reward": 0.9150390625, - "reward_std": 0.3154516816139221, - "rewards/accuracy_reward/mean": 0.06854838877916336, - "rewards/accuracy_reward/std": 0.25293970108032227, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.8447265625, - "rewards/tag_count_reward/std": 0.2352529615163803, + "grad_norm": 2.3568570613861084, + "kl": 1.49267578125, + "learning_rate": 9.260131697043882e-07, + "loss": 0.1556, + "num_tokens": 537992347.0, + "reward": 1.056640625, + "reward_std": 0.16587641835212708, + "rewards/accuracy_reward/mean": 0.09677419066429138, + "rewards/accuracy_reward/std": 0.2959485352039337, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.962890625, + "rewards/tag_count_reward/std": 0.1254730522632599, "step": 782 }, { @@ -22693,27 +22693,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1943.0, - "completions/mean_length": 778.810546875, - "completions/mean_terminated_length": 748.3500366210938, - "completions/min_length": 94.0, - "completions/min_terminated_length": 94.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1002.91015625, + "completions/mean_terminated_length": 982.0916748046875, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, "epoch": 0.26730391738499615, - "grad_norm": 13.338995933532715, - "kl": 3.14453125, - "learning_rate": 9.254777807564626e-07, - "loss": 0.2634, - "num_tokens": 493078374.0, - "reward": 0.88623046875, - "reward_std": 0.3087897002696991, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, + "grad_norm": 4.647830009460449, + "kl": 1.54296875, + "learning_rate": 9.257182739533203e-07, + "loss": 0.1065, + "num_tokens": 538580141.0, + "reward": 1.05908203125, + "reward_std": 0.18386191129684448, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.83544921875, - "rewards/tag_count_reward/std": 0.24808107316493988, + "rewards/tag_count_reward/mean": 0.96142578125, + "rewards/tag_count_reward/std": 0.1349107027053833, "step": 783 }, { @@ -22724,25 +22724,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 717.474609375, - "completions/mean_terminated_length": 674.554443359375, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 921.39453125, + "completions/mean_terminated_length": 885.0523681640625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, "epoch": 0.26764530169838696, - "grad_norm": 11.409294128417969, - "kl": 2.8984375, - "learning_rate": 9.251820294419661e-07, - "loss": 0.2482, - "num_tokens": 493520121.0, - "reward": 0.9228515625, - "reward_std": 0.33445990085601807, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.8447265625, - "rewards/tag_count_reward/std": 0.23421084880828857, + "grad_norm": 8.755029678344727, + "kl": 2.267578125, + "learning_rate": 9.254228445342056e-07, + "loss": 0.1518, + "num_tokens": 539126295.0, + "reward": 1.0888671875, + "reward_std": 0.252022922039032, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9599609375, + "rewards/tag_count_reward/std": 0.14456358551979065, "step": 784 }, { @@ -22751,27 +22751,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 707.994140625, - "completions/mean_terminated_length": 678.5728759765625, - "completions/min_length": 84.0, - "completions/min_terminated_length": 84.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 980.40234375, + "completions/mean_terminated_length": 961.3001708984375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.2679866860117778, - "grad_norm": 8.545931816101074, - "kl": 2.40234375, - "learning_rate": 9.248857456252005e-07, - "loss": 0.1561, - "num_tokens": 493955734.0, - "reward": 0.935546875, - "reward_std": 0.26695406436920166, - "rewards/accuracy_reward/mean": 0.08870967477560043, - "rewards/accuracy_reward/std": 0.284611314535141, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.84765625, - "rewards/tag_count_reward/std": 0.23036254942417145, + "grad_norm": 7.662311553955078, + "kl": 1.6728515625, + "learning_rate": 9.251268818666695e-07, + "loss": 0.103, + "num_tokens": 539701381.0, + "reward": 1.07080078125, + "reward_std": 0.14887943863868713, + "rewards/accuracy_reward/mean": 0.11088709533214569, + "rewards/accuracy_reward/std": 0.3143092691898346, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.96337890625, + "rewards/tag_count_reward/std": 0.12610261142253876, "step": 785 }, { @@ -22780,27 +22780,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 773.412109375, - "completions/mean_terminated_length": 740.2064208984375, - "completions/min_length": 93.0, - "completions/min_terminated_length": 93.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 958.181640625, + "completions/mean_terminated_length": 938.681884765625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, "epoch": 0.26832807032516853, - "grad_norm": 2.3647966384887695, - "kl": 3.6171875, - "learning_rate": 9.245889297266866e-07, - "loss": 0.2264, - "num_tokens": 494425017.0, - "reward": 0.90771484375, - "reward_std": 0.319466769695282, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.84130859375, - "rewards/tag_count_reward/std": 0.23216763138771057, + "grad_norm": 1.6267530918121338, + "kl": 1.166015625, + "learning_rate": 9.248303863710965e-07, + "loss": 0.1044, + "num_tokens": 540265266.0, + "reward": 1.1005859375, + "reward_std": 0.22841449081897736, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9658203125, + "rewards/tag_count_reward/std": 0.1253320872783661, "step": 786 }, { @@ -22809,27 +22809,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 779.95703125, - "completions/mean_terminated_length": 717.59423828125, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 947.267578125, + "completions/mean_terminated_length": 934.2154541015625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, "epoch": 0.26866945463855935, - "grad_norm": 4.059122085571289, - "kl": 6.2890625, - "learning_rate": 9.242915821677001e-07, - "loss": 0.3614, - "num_tokens": 494898771.0, - "reward": 0.89306640625, - "reward_std": 0.33708655834198, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, + "grad_norm": 1.436933159828186, + "kl": 0.96484375, + "learning_rate": 9.245333584686264e-07, + "loss": 0.105, + "num_tokens": 540824683.0, + "reward": 1.140625, + "reward_std": 0.25158441066741943, + "rewards/accuracy_reward/mean": 0.166015625, + "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.80712890625, - "rewards/tag_count_reward/std": 0.25942689180374146, + "rewards/tag_count_reward/mean": 0.974609375, + "rewards/tag_count_reward/std": 0.1064911037683487, "step": 787 }, { @@ -22838,27 +22838,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.00390625, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1951.0, - "completions/mean_length": 680.892578125, - "completions/mean_terminated_length": 675.5314331054688, - "completions/min_length": 35.0, - "completions/min_terminated_length": 35.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 926.09375, + "completions/mean_terminated_length": 906.0198364257812, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, "epoch": 0.26901083895195016, - "grad_norm": 3.4603018760681152, - "kl": 4.7109375, - "learning_rate": 9.239937033702717e-07, - "loss": 0.2414, - "num_tokens": 495322892.0, - "reward": 0.9091796875, - "reward_std": 0.3355030119419098, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.8310546875, - "rewards/tag_count_reward/std": 0.25464847683906555, + "grad_norm": 9.073989868164062, + "kl": 2.3349609375, + "learning_rate": 9.242357985811562e-07, + "loss": 0.1738, + "num_tokens": 541374347.0, + "reward": 1.1435546875, + "reward_std": 0.24310924112796783, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3776407241821289, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9716796875, + "rewards/tag_count_reward/std": 0.11778579652309418, "step": 788 }, { @@ -22867,27 +22867,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 695.173828125, - "completions/mean_terminated_length": 662.7060546875, - "completions/min_length": 57.0, - "completions/min_terminated_length": 57.0, + "completions/max_terminated_length": 1894.0, + "completions/mean_length": 922.802734375, + "completions/mean_terminated_length": 907.2059936523438, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, "epoch": 0.269352223265341, - "grad_norm": 3.6373889446258545, - "kl": 5.390625, - "learning_rate": 9.236952937571856e-07, - "loss": 0.3142, - "num_tokens": 495755109.0, - "reward": 0.95166015625, - "reward_std": 0.3479437530040741, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.83251953125, - "rewards/tag_count_reward/std": 0.2490728348493576, + "grad_norm": 4.688058376312256, + "kl": 1.59375, + "learning_rate": 9.239377071313381e-07, + "loss": 0.1196, + "num_tokens": 541923110.0, + "reward": 1.1357421875, + "reward_std": 0.23167961835861206, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9677734375, + "rewards/tag_count_reward/std": 0.11358989775180817, "step": 789 }, { @@ -22898,25 +22898,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 745.138671875, - "completions/mean_terminated_length": 721.8270263671875, - "completions/min_length": 70.0, - "completions/min_terminated_length": 70.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 978.75390625, + "completions/mean_terminated_length": 959.6222534179688, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, "epoch": 0.26969360757873173, - "grad_norm": 7.922231674194336, - "kl": 4.96875, - "learning_rate": 9.233963537519799e-07, - "loss": 0.2474, - "num_tokens": 496211196.0, - "reward": 0.947265625, - "reward_std": 0.3379775583744049, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.845703125, - "rewards/tag_count_reward/std": 0.23949779570102692, + "grad_norm": 8.775617599487305, + "kl": 2.291015625, + "learning_rate": 9.236390845425797e-07, + "loss": 0.162, + "num_tokens": 542498808.0, + "reward": 1.14697265625, + "reward_std": 0.25526946783065796, + "rewards/accuracy_reward/mean": 0.17578125, + "rewards/accuracy_reward/std": 0.3810062110424042, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.97119140625, + "rewards/tag_count_reward/std": 0.11818567663431168, "step": 790 }, { @@ -22927,25 +22927,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1910.0, - "completions/mean_length": 661.783203125, - "completions/mean_terminated_length": 642.568359375, - "completions/min_length": 74.0, - "completions/min_terminated_length": 74.0, + "completions/max_terminated_length": 1833.0, + "completions/mean_length": 920.056640625, + "completions/mean_terminated_length": 904.4218139648438, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.27003499189212254, - "grad_norm": 3.0672128200531006, - "kl": 3.75390625, - "learning_rate": 9.230968837789451e-07, - "loss": 0.2257, - "num_tokens": 496615261.0, - "reward": 0.90283203125, - "reward_std": 0.29471856355667114, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.84033203125, - "rewards/tag_count_reward/std": 0.24284131824970245, + "grad_norm": 2.28690505027771, + "kl": 1.61328125, + "learning_rate": 9.233399312390427e-07, + "loss": 0.1422, + "num_tokens": 543035109.0, + "reward": 1.11962890625, + "reward_std": 0.21681568026542664, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.96728515625, + "rewards/tag_count_reward/std": 0.11923210322856903, "step": 791 }, { @@ -22954,27 +22954,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1746.0, - "completions/mean_length": 734.173828125, - "completions/mean_terminated_length": 686.3016357421875, - "completions/min_length": 19.0, - "completions/min_terminated_length": 19.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 984.072265625, + "completions/mean_terminated_length": 971.45654296875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, "epoch": 0.27037637620551336, - "grad_norm": 1.4163967370986938, - "kl": 4.34765625, - "learning_rate": 9.227968842631243e-07, - "loss": 0.2629, - "num_tokens": 497067414.0, - "reward": 0.865234375, - "reward_std": 0.2757364511489868, - "rewards/accuracy_reward/mean": 0.021484375, - "rewards/accuracy_reward/std": 0.14513419568538666, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.8359375, - "rewards/tag_count_reward/std": 0.24014326930046082, + "grad_norm": 2.2115464210510254, + "kl": 1.134765625, + "learning_rate": 9.230402476456424e-07, + "loss": 0.0898, + "num_tokens": 543615210.0, + "reward": 1.02392578125, + "reward_std": 0.1776273399591446, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.96728515625, + "rewards/tag_count_reward/std": 0.118201844394207, "step": 792 }, { @@ -22983,27 +22983,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 757.34375, - "completions/mean_terminated_length": 731.6334838867188, - "completions/min_length": 43.0, - "completions/min_terminated_length": 43.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 975.880859375, + "completions/mean_terminated_length": 952.34130859375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, "epoch": 0.27071776051890417, - "grad_norm": 4.963979244232178, - "kl": 3.953125, - "learning_rate": 9.224963556303116e-07, - "loss": 0.2585, - "num_tokens": 497534902.0, - "reward": 0.9013671875, - "reward_std": 0.3063977062702179, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.8408203125, - "rewards/tag_count_reward/std": 0.23627431690692902, + "grad_norm": 4.202908992767334, + "kl": 1.966796875, + "learning_rate": 9.22740034188048e-07, + "loss": 0.1393, + "num_tokens": 544194589.0, + "reward": 1.08740234375, + "reward_std": 0.22083085775375366, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95654296875, + "rewards/tag_count_reward/std": 0.14142537117004395, "step": 793 }, { @@ -23012,27 +23012,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1953.0, - "completions/mean_length": 717.626953125, - "completions/mean_terminated_length": 701.851806640625, - "completions/min_length": 79.0, - "completions/min_terminated_length": 79.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 938.03515625, + "completions/mean_terminated_length": 902.2297973632812, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, "epoch": 0.27105914483229493, - "grad_norm": 2.173969030380249, - "kl": 3.97265625, - "learning_rate": 9.221952983070526e-07, - "loss": 0.2175, - "num_tokens": 497992423.0, - "reward": 0.8916015625, - "reward_std": 0.28696465492248535, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.8427734375, - "rewards/tag_count_reward/std": 0.23603153228759766, + "grad_norm": 17.585086822509766, + "kl": 1.8251953125, + "learning_rate": 9.224392912926809e-07, + "loss": 0.1545, + "num_tokens": 544764959.0, + "reward": 1.04248046875, + "reward_std": 0.18668007850646973, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.96826171875, + "rewards/tag_count_reward/std": 0.12051546573638916, "step": 794 }, { @@ -23043,25 +23043,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 729.138671875, - "completions/mean_terminated_length": 702.8665771484375, - "completions/min_length": 9.0, - "completions/min_terminated_length": 9.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1014.638671875, + "completions/mean_terminated_length": 994.0538330078125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, "epoch": 0.27140052914568574, - "grad_norm": 2.111781358718872, - "kl": 4.3125, - "learning_rate": 9.218937127206432e-07, - "loss": 0.2517, - "num_tokens": 498433902.0, - "reward": 0.94580078125, - "reward_std": 0.3448649048805237, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.85009765625, - "rewards/tag_count_reward/std": 0.232225239276886, + "grad_norm": 3.2779500484466553, + "kl": 0.9375, + "learning_rate": 9.221380193867144e-07, + "loss": 0.0841, + "num_tokens": 545352614.0, + "reward": 1.16796875, + "reward_std": 0.2738982141017914, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.39980348944664, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.12114909291267395, "step": 795 }, { @@ -23070,27 +23070,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1945.0, - "completions/mean_length": 734.720703125, - "completions/mean_terminated_length": 708.5597534179688, - "completions/min_length": 51.0, - "completions/min_terminated_length": 51.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1015.81640625, + "completions/mean_terminated_length": 999.4326171875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, "epoch": 0.27174191345907656, - "grad_norm": 1.4413419961929321, - "kl": 3.984375, - "learning_rate": 9.215915992991289e-07, - "loss": 0.2238, - "num_tokens": 498886735.0, - "reward": 0.88037109375, - "reward_std": 0.2774122357368469, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.84326171875, - "rewards/tag_count_reward/std": 0.22441160678863525, + "grad_norm": 1.4899282455444336, + "kl": 0.73583984375, + "learning_rate": 9.218362188980732e-07, + "loss": 0.0689, + "num_tokens": 545949368.0, + "reward": 1.029296875, + "reward_std": 0.21988213062286377, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.12487763166427612, "step": 796 }, { @@ -23099,27 +23099,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1881.0, - "completions/mean_length": 697.185546875, - "completions/mean_terminated_length": 678.46142578125, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 904.7890625, + "completions/mean_terminated_length": 879.6885986328125, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.27208329777246737, - "grad_norm": 4.815097332000732, - "kl": 3.9296875, - "learning_rate": 9.212889584713044e-07, - "loss": 0.2891, - "num_tokens": 499322286.0, - "reward": 1.0068359375, - "reward_std": 0.3376759886741638, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.8701171875, - "rewards/tag_count_reward/std": 0.2233344167470932, + "grad_norm": 3.2160911560058594, + "kl": 1.11083984375, + "learning_rate": 9.215338902554335e-07, + "loss": 0.1037, + "num_tokens": 546491212.0, + "reward": 1.166015625, + "reward_std": 0.2668801248073578, + "rewards/accuracy_reward/mean": 0.201171875, + "rewards/accuracy_reward/std": 0.4012683033943176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.13357339799404144, "step": 797 }, { @@ -23128,27 +23128,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1962.0, - "completions/mean_length": 746.00390625, - "completions/mean_terminated_length": 695.8255615234375, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, - "epoch": 0.2724246820858581, - "grad_norm": 2.2542262077331543, - "kl": 5.3984375, - "learning_rate": 9.20985790666713e-07, - "loss": 0.353, - "num_tokens": 499781520.0, - "reward": 0.91015625, - "reward_std": 0.31544923782348633, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.84375, - "rewards/tag_count_reward/std": 0.24975526332855225, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 928.353515625, + "completions/mean_terminated_length": 906.0498046875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.2724246820858581, + "grad_norm": 3.320685386657715, + "kl": 0.61376953125, + "learning_rate": 9.212310338882207e-07, + "loss": 0.0856, + "num_tokens": 547043809.0, + "reward": 1.10009765625, + "reward_std": 0.1896565556526184, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.96923828125, + "rewards/tag_count_reward/std": 0.12277772277593613, "step": 798 }, { @@ -23157,27 +23157,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 790.234375, - "completions/mean_terminated_length": 736.43994140625, - "completions/min_length": 32.0, - "completions/min_terminated_length": 32.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 997.97265625, + "completions/mean_terminated_length": 979.1848754882812, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, "epoch": 0.27276606639924894, - "grad_norm": 1.8710473775863647, - "kl": 6.328125, - "learning_rate": 9.20682096315646e-07, - "loss": 0.4437, - "num_tokens": 500256904.0, - "reward": 0.873046875, - "reward_std": 0.3145484924316406, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.81640625, - "rewards/tag_count_reward/std": 0.2564898729324341, + "grad_norm": 6.246214866638184, + "kl": 0.84716796875, + "learning_rate": 9.209276502266102e-07, + "loss": 0.1, + "num_tokens": 547625555.0, + "reward": 1.1171875, + "reward_std": 0.23665377497673035, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.966796875, + "rewards/tag_count_reward/std": 0.12461719661951065, "step": 799 }, { @@ -23186,27 +23186,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 772.85546875, - "completions/mean_terminated_length": 729.0626831054688, - "completions/min_length": 38.0, - "completions/min_terminated_length": 38.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 945.556640625, + "completions/mean_terminated_length": 912.28369140625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, "epoch": 0.27310745071263975, - "grad_norm": 2.3094871044158936, - "kl": 6.1484375, - "learning_rate": 9.20377875849142e-07, - "loss": 0.4261, - "num_tokens": 500728510.0, - "reward": 0.85205078125, - "reward_std": 0.30268627405166626, - "rewards/accuracy_reward/mean": 0.038306452333927155, - "rewards/accuracy_reward/std": 0.19212883710861206, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.81298828125, - "rewards/tag_count_reward/std": 0.2627609074115753, + "grad_norm": 8.196688652038574, + "kl": 1.525390625, + "learning_rate": 9.206237397015267e-07, + "loss": 0.1532, + "num_tokens": 548185584.0, + "reward": 1.07470703125, + "reward_std": 0.2203240543603897, + "rewards/accuracy_reward/mean": 0.11693548411130905, + "rewards/accuracy_reward/std": 0.3216678202152252, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.96142578125, + "rewards/tag_count_reward/std": 0.13308516144752502, "step": 800 }, { @@ -23215,27 +23215,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 739.9609375, - "completions/mean_terminated_length": 708.5680541992188, - "completions/min_length": 48.0, - "completions/min_terminated_length": 48.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 986.818359375, + "completions/mean_terminated_length": 959.17236328125, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, "epoch": 0.27344883502603057, - "grad_norm": 7.303294658660889, - "kl": 5.828125, - "learning_rate": 9.200731296989862e-07, - "loss": 0.3461, - "num_tokens": 501184426.0, - "reward": 0.88720703125, - "reward_std": 0.32939833402633667, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.81298828125, - "rewards/tag_count_reward/std": 0.2599530518054962, + "grad_norm": 7.788846969604492, + "kl": 1.2353515625, + "learning_rate": 9.203193027446429e-07, + "loss": 0.1359, + "num_tokens": 548767891.0, + "reward": 1.04736328125, + "reward_std": 0.20470289885997772, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95556640625, + "rewards/tag_count_reward/std": 0.1519709825515747, "step": 801 }, { @@ -23244,27 +23244,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 784.236328125, - "completions/mean_terminated_length": 730.1853637695312, - "completions/min_length": 100.0, - "completions/min_terminated_length": 100.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 963.51171875, + "completions/mean_terminated_length": 941.9083862304688, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, "epoch": 0.2737902193394214, - "grad_norm": 2.827252149581909, - "kl": 5.921875, - "learning_rate": 9.1976785829771e-07, - "loss": 0.3723, - "num_tokens": 501662051.0, - "reward": 0.85205078125, - "reward_std": 0.2874138653278351, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, + "grad_norm": 2.766261339187622, + "kl": 0.98583984375, + "learning_rate": 9.20014339788379e-07, + "loss": 0.0917, + "num_tokens": 549337305.0, + "reward": 1.03564453125, + "reward_std": 0.1824563592672348, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.81884765625, - "rewards/tag_count_reward/std": 0.254165381193161, + "rewards/tag_count_reward/mean": 0.96142578125, + "rewards/tag_count_reward/std": 0.12935683131217957, "step": 802 }, { @@ -23273,27 +23273,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 764.15625, - "completions/mean_terminated_length": 701.016357421875, - "completions/min_length": 41.0, - "completions/min_terminated_length": 41.0, + "completions/max_terminated_length": 1768.0, + "completions/mean_length": 926.216796875, + "completions/mean_terminated_length": 887.6909790039062, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.27413160365281214, - "grad_norm": 2.5225069522857666, - "kl": 6.3671875, - "learning_rate": 9.194620620785905e-07, - "loss": 0.3984, - "num_tokens": 502127811.0, - "reward": 0.8857421875, - "reward_std": 0.29661011695861816, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, + "grad_norm": 4.854599475860596, + "kl": 2.498046875, + "learning_rate": 9.197088512659028e-07, + "loss": 0.2291, + "num_tokens": 549886040.0, + "reward": 1.06689453125, + "reward_std": 0.22579032182693481, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8271484375, - "rewards/tag_count_reward/std": 0.2505478858947754, + "rewards/tag_count_reward/mean": 0.94970703125, + "rewards/tag_count_reward/std": 0.15883907675743103, "step": 803 }, { @@ -23302,27 +23302,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 765.015625, - "completions/mean_terminated_length": 693.591796875, - "completions/min_length": 92.0, - "completions/min_terminated_length": 92.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 918.013671875, + "completions/mean_terminated_length": 890.89404296875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.27447298796620295, - "grad_norm": 4.072855472564697, - "kl": 5.7421875, - "learning_rate": 9.191557414756495e-07, - "loss": 0.4049, - "num_tokens": 502591579.0, - "reward": 0.890625, - "reward_std": 0.33126142621040344, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.8203125, - "rewards/tag_count_reward/std": 0.25687703490257263, + "grad_norm": 5.286581516265869, + "kl": 2.458984375, + "learning_rate": 9.194028376111284e-07, + "loss": 0.2059, + "num_tokens": 550428143.0, + "reward": 1.04736328125, + "reward_std": 0.2311810702085495, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95751953125, + "rewards/tag_count_reward/std": 0.13998566567897797, "step": 804 }, { @@ -23331,27 +23331,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 828.080078125, - "completions/mean_terminated_length": 754.8344116210938, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 1832.0, + "completions/mean_length": 953.55859375, + "completions/mean_terminated_length": 915.9717407226562, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, "epoch": 0.27481437227959377, - "grad_norm": 3.144808292388916, - "kl": 6.0546875, - "learning_rate": 9.188488969236531e-07, - "loss": 0.4617, - "num_tokens": 503088020.0, - "reward": 0.9384765625, - "reward_std": 0.3468049168586731, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.8271484375, - "rewards/tag_count_reward/std": 0.2558613717556, + "grad_norm": 2.703359603881836, + "kl": 2.9765625, + "learning_rate": 9.190962992587157e-07, + "loss": 0.2383, + "num_tokens": 550988829.0, + "reward": 1.10400390625, + "reward_std": 0.22138017416000366, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94384765625, + "rewards/tag_count_reward/std": 0.16447223722934723, "step": 805 }, { @@ -23360,27 +23360,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 761.431640625, - "completions/mean_terminated_length": 703.6672973632812, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 897.677734375, + "completions/mean_terminated_length": 855.76318359375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, "epoch": 0.2751557565929846, - "grad_norm": 6.194130897521973, - "kl": 5.5703125, - "learning_rate": 9.185415288581105e-07, - "loss": 0.4197, - "num_tokens": 503560961.0, - "reward": 0.875, - "reward_std": 0.2672116756439209, - "rewards/accuracy_reward/mean": 0.02734375, - "rewards/accuracy_reward/std": 0.16324250400066376, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.845703125, - "rewards/tag_count_reward/std": 0.23641379177570343, + "grad_norm": 8.313536643981934, + "kl": 4.1875, + "learning_rate": 9.187892366440702e-07, + "loss": 0.3229, + "num_tokens": 551531528.0, + "reward": 0.966796875, + "reward_std": 0.2052759975194931, + "rewards/accuracy_reward/mean": 0.029296875, + "rewards/accuracy_reward/std": 0.16880230605602264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.17416280508041382, "step": 806 }, { @@ -23389,27 +23389,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 783.896484375, - "completions/mean_terminated_length": 721.7274169921875, - "completions/min_length": 16.0, - "completions/min_terminated_length": 16.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 917.73828125, + "completions/mean_terminated_length": 866.9918212890625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.27549714090637534, - "grad_norm": 1.6844385862350464, - "kl": 5.296875, - "learning_rate": 9.182336377152753e-07, - "loss": 0.3712, - "num_tokens": 504045100.0, - "reward": 0.8623046875, - "reward_std": 0.2624799311161041, - "rewards/accuracy_reward/mean": 0.017578125, - "rewards/accuracy_reward/std": 0.13154059648513794, + "grad_norm": 15.719550132751465, + "kl": 4.6064453125, + "learning_rate": 9.184816502033417e-07, + "loss": 0.3137, + "num_tokens": 552084194.0, + "reward": 1.01220703125, + "reward_std": 0.21843752264976501, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8447265625, - "rewards/tag_count_reward/std": 0.24493204057216644, + "rewards/tag_count_reward/mean": 0.93994140625, + "rewards/tag_count_reward/std": 0.17813833057880402, "step": 807 }, { @@ -23418,27 +23418,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 769.341796875, - "completions/mean_terminated_length": 720.0628662109375, - "completions/min_length": 8.0, - "completions/min_terminated_length": 8.0, + "completions/max_terminated_length": 1932.0, + "completions/mean_length": 923.197265625, + "completions/mean_terminated_length": 843.1903686523438, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, "epoch": 0.27583852521976615, - "grad_norm": 6.979933261871338, - "kl": 7.3515625, - "learning_rate": 9.179252239321419e-07, - "loss": 0.4488, - "num_tokens": 504514219.0, - "reward": 0.9267578125, - "reward_std": 0.32439467310905457, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.8388671875, - "rewards/tag_count_reward/std": 0.24262726306915283, + "grad_norm": 21.94876480102539, + "kl": 6.796875, + "learning_rate": 9.181735403734241e-07, + "loss": 0.4634, + "num_tokens": 552632087.0, + "reward": 1.0634765625, + "reward_std": 0.2763099670410156, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9189453125, + "rewards/tag_count_reward/std": 0.20034818351268768, "step": 808 }, { @@ -23447,27 +23447,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 801.63671875, - "completions/mean_terminated_length": 756.2227172851562, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 922.220703125, + "completions/mean_terminated_length": 869.2698974609375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, "epoch": 0.27617990953315696, - "grad_norm": 2.551192045211792, - "kl": 7.0390625, - "learning_rate": 9.176162879464477e-07, - "loss": 0.4806, - "num_tokens": 505010033.0, - "reward": 0.93994140625, - "reward_std": 0.34775638580322266, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.83642578125, - "rewards/tag_count_reward/std": 0.25551894307136536, + "grad_norm": 37.096961975097656, + "kl": 5.74609375, + "learning_rate": 9.17864907591955e-07, + "loss": 0.3865, + "num_tokens": 553189640.0, + "reward": 1.0576171875, + "reward_std": 0.2363978624343872, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.18517960608005524, "step": 809 }, { @@ -23476,27 +23476,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 789.384765625, - "completions/mean_terminated_length": 732.87548828125, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 867.037109375, + "completions/mean_terminated_length": 793.5332641601562, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.2765212938465478, - "grad_norm": 8.129801750183105, - "kl": 8.7734375, - "learning_rate": 9.173068301966707e-07, - "loss": 0.564, - "num_tokens": 505487574.0, - "reward": 0.884765625, - "reward_std": 0.3085659146308899, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.830078125, - "rewards/tag_count_reward/std": 0.2530317008495331, + "grad_norm": 22.411043167114258, + "kl": 5.69140625, + "learning_rate": 9.175557522973146e-07, + "loss": 0.3527, + "num_tokens": 553706939.0, + "reward": 1.00146484375, + "reward_std": 0.23566189408302307, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92529296875, + "rewards/tag_count_reward/std": 0.19761274755001068, "step": 810 }, { @@ -23505,27 +23505,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1959.0, - "completions/mean_length": 745.357421875, - "completions/mean_terminated_length": 714.0940551757812, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 867.408203125, + "completions/mean_terminated_length": 806.8029174804688, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, "epoch": 0.27686267815993854, - "grad_norm": 2.361401319503784, - "kl": 5.078125, - "learning_rate": 9.169968511220296e-07, - "loss": 0.3309, - "num_tokens": 505946237.0, - "reward": 0.90771484375, - "reward_std": 0.2869418263435364, - "rewards/accuracy_reward/mean": 0.05443548411130905, - "rewards/accuracy_reward/std": 0.227104052901268, + "grad_norm": 26.506681442260742, + "kl": 6.0078125, + "learning_rate": 9.17246074928625e-07, + "loss": 0.3716, + "num_tokens": 554228092.0, + "reward": 1.03173828125, + "reward_std": 0.23587149381637573, + "rewards/accuracy_reward/mean": 0.0927419364452362, + "rewards/accuracy_reward/std": 0.2903633117675781, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.85498046875, - "rewards/tag_count_reward/std": 0.23111572861671448, + "rewards/tag_count_reward/mean": 0.94189453125, + "rewards/tag_count_reward/std": 0.17251938581466675, "step": 811 }, { @@ -23534,27 +23534,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1926.0, - "completions/mean_length": 702.703125, - "completions/mean_terminated_length": 667.6553344726562, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 824.185546875, + "completions/mean_terminated_length": 782.1555786132812, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, "epoch": 0.27720406247332935, - "grad_norm": 4.859031677246094, - "kl": 4.73828125, - "learning_rate": 9.166863511624828e-07, - "loss": 0.3554, - "num_tokens": 506385845.0, - "reward": 0.98681640625, - "reward_std": 0.3073740005493164, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, + "grad_norm": 90.6357192993164, + "kl": 5.96875, + "learning_rate": 9.169358759257508e-07, + "loss": 0.3751, + "num_tokens": 554729899.0, + "reward": 1.1025390625, + "reward_std": 0.25330930948257446, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.88134765625, - "rewards/tag_count_reward/std": 0.2103225141763687, + "rewards/tag_count_reward/mean": 0.9462890625, + "rewards/tag_count_reward/std": 0.16116593778133392, "step": 812 }, { @@ -23563,27 +23563,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 759.708984375, - "completions/mean_terminated_length": 720.8269653320312, - "completions/min_length": 44.0, - "completions/min_terminated_length": 44.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 894.39453125, + "completions/mean_terminated_length": 866.7080688476562, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.27754544678672016, - "grad_norm": 4.14418363571167, - "kl": 3.7109375, - "learning_rate": 9.163753307587285e-07, - "loss": 0.2662, - "num_tokens": 506852368.0, - "reward": 0.90771484375, - "reward_std": 0.27509641647338867, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.87646484375, - "rewards/tag_count_reward/std": 0.23250487446784973, + "grad_norm": 4.809081554412842, + "kl": 3.384765625, + "learning_rate": 9.166251557292959e-07, + "loss": 0.2645, + "num_tokens": 555265381.0, + "reward": 0.994140625, + "reward_std": 0.1926138997077942, + "rewards/accuracy_reward/mean": 0.037109375, + "rewards/accuracy_reward/std": 0.18921469151973724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.1454104632139206, "step": 813 }, { @@ -23592,27 +23592,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 716.064453125, - "completions/mean_terminated_length": 692.2325439453125, - "completions/min_length": 83.0, - "completions/min_terminated_length": 83.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 870.2421875, + "completions/mean_terminated_length": 812.3196411132812, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.277886831100111, - "grad_norm": 8.611891746520996, - "kl": 4.66796875, - "learning_rate": 9.160637903522031e-07, - "loss": 0.3462, - "num_tokens": 507292577.0, - "reward": 0.90771484375, - "reward_std": 0.28996092081069946, - "rewards/accuracy_reward/mean": 0.04233871027827263, - "rewards/accuracy_reward/std": 0.2015640139579773, + "grad_norm": 9.092361450195312, + "kl": 4.45703125, + "learning_rate": 9.163139147806062e-07, + "loss": 0.3152, + "num_tokens": 555784529.0, + "reward": 1.00927734375, + "reward_std": 0.23800216615200043, + "rewards/accuracy_reward/mean": 0.07056451588869095, + "rewards/accuracy_reward/std": 0.25635457038879395, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.86669921875, - "rewards/tag_count_reward/std": 0.2291809767484665, + "rewards/tag_count_reward/mean": 0.94091796875, + "rewards/tag_count_reward/std": 0.17360158264636993, "step": 814 }, { @@ -23621,27 +23621,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1904.0, - "completions/mean_length": 752.1640625, - "completions/mean_terminated_length": 726.3506469726562, - "completions/min_length": 70.0, - "completions/min_terminated_length": 70.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 933.630859375, + "completions/mean_terminated_length": 881.2167358398438, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, "epoch": 0.27822821541350173, - "grad_norm": 2.8570358753204346, - "kl": 4.58984375, - "learning_rate": 9.157517303850814e-07, - "loss": 0.2891, - "num_tokens": 507759973.0, - "reward": 0.93505859375, - "reward_std": 0.27268046140670776, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, + "grad_norm": 7.917178630828857, + "kl": 4.19140625, + "learning_rate": 9.160021535217661e-07, + "loss": 0.2892, + "num_tokens": 556344836.0, + "reward": 1.0244140625, + "reward_std": 0.21468347311019897, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87255859375, - "rewards/tag_count_reward/std": 0.2222767174243927, + "rewards/tag_count_reward/mean": 0.9404296875, + "rewards/tag_count_reward/std": 0.17308135330677032, "step": 815 }, { @@ -23650,27 +23650,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 757.0625, - "completions/mean_terminated_length": 712.727294921875, - "completions/min_length": 93.0, - "completions/min_terminated_length": 93.0, + "completions/max_terminated_length": 1791.0, + "completions/mean_length": 920.28125, + "completions/mean_terminated_length": 874.43896484375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.27856959972689255, - "grad_norm": 6.3649797439575195, - "kl": 6.4140625, - "learning_rate": 9.154391513002754e-07, - "loss": 0.3876, - "num_tokens": 508231173.0, - "reward": 0.8984375, - "reward_std": 0.2914305627346039, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.84765625, - "rewards/tag_count_reward/std": 0.24377650022506714, + "grad_norm": 5.408258438110352, + "kl": 3.021484375, + "learning_rate": 9.156898723955997e-07, + "loss": 0.2441, + "num_tokens": 556899604.0, + "reward": 1.01953125, + "reward_std": 0.1890992522239685, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.16211473941802979, "step": 816 }, { @@ -23679,27 +23679,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1885.0, - "completions/mean_length": 680.474609375, - "completions/mean_terminated_length": 661.5188598632812, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 894.185546875, + "completions/mean_terminated_length": 849.718017578125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, "epoch": 0.27891098404028336, - "grad_norm": 2.3406496047973633, - "kl": 5.4921875, - "learning_rate": 9.151260535414336e-07, - "loss": 0.3474, - "num_tokens": 508655576.0, - "reward": 0.9384765625, - "reward_std": 0.2707592844963074, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.8759765625, - "rewards/tag_count_reward/std": 0.2127285748720169, + "grad_norm": 13.570917129516602, + "kl": 2.794921875, + "learning_rate": 9.153770718456693e-07, + "loss": 0.2587, + "num_tokens": 557433427.0, + "reward": 1.01953125, + "reward_std": 0.19096723198890686, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.1564028114080429, "step": 817 }, { @@ -23708,27 +23708,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 653.947265625, - "completions/mean_terminated_length": 640.19921875, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/max_terminated_length": 1720.0, + "completions/mean_length": 841.42578125, + "completions/mean_terminated_length": 809.9920043945312, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, "epoch": 0.2792523683536742, - "grad_norm": 10.013449668884277, - "kl": 5.6484375, - "learning_rate": 9.148124375529414e-07, - "loss": 0.2774, - "num_tokens": 509063469.0, - "reward": 0.9892578125, - "reward_std": 0.3207172453403473, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.8818359375, - "rewards/tag_count_reward/std": 0.21030718088150024, + "grad_norm": 9.170187950134277, + "kl": 1.875, + "learning_rate": 9.15063752316275e-07, + "loss": 0.1896, + "num_tokens": 557937309.0, + "reward": 1.1328125, + "reward_std": 0.263831228017807, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3776407241821289, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.14483103156089783, "step": 818 }, { @@ -23737,27 +23737,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1917.0, - "completions/mean_length": 724.18359375, - "completions/mean_terminated_length": 708.4862060546875, - "completions/min_length": 138.0, - "completions/min_terminated_length": 138.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 958.130859375, + "completions/mean_terminated_length": 911.517333984375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, "epoch": 0.27959375266706493, - "grad_norm": 6.756846904754639, - "kl": 5.28125, - "learning_rate": 9.144983037799192e-07, - "loss": 0.2523, - "num_tokens": 509515339.0, - "reward": 0.9443359375, - "reward_std": 0.27607595920562744, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, + "grad_norm": 20.351158142089844, + "kl": 1.849609375, + "learning_rate": 9.14749914252454e-07, + "loss": 0.1799, + "num_tokens": 558508960.0, + "reward": 1.04443359375, + "reward_std": 0.22863608598709106, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8798828125, - "rewards/tag_count_reward/std": 0.20919561386108398, + "rewards/tag_count_reward/mean": 0.95458984375, + "rewards/tag_count_reward/std": 0.15722467005252838, "step": 819 }, { @@ -23766,27 +23766,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1849.0, - "completions/mean_length": 681.810546875, - "completions/mean_terminated_length": 660.1250610351562, - "completions/min_length": 53.0, - "completions/min_terminated_length": 53.0, + "completions/max_terminated_length": 1917.0, + "completions/mean_length": 866.94140625, + "completions/mean_terminated_length": 843.4143676757812, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.27993513698045575, - "grad_norm": 2.1358535289764404, - "kl": 4.38671875, - "learning_rate": 9.141836526682226e-07, - "loss": 0.2262, - "num_tokens": 509936922.0, - "reward": 0.96435546875, - "reward_std": 0.3120565414428711, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.88427734375, - "rewards/tag_count_reward/std": 0.21195176243782043, + "grad_norm": 6.2960662841796875, + "kl": 1.06591796875, + "learning_rate": 9.144355580999798e-07, + "loss": 0.1062, + "num_tokens": 559025330.0, + "reward": 1.083984375, + "reward_std": 0.23968768119812012, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.958984375, + "rewards/tag_count_reward/std": 0.14172318577766418, "step": 820 }, { @@ -23795,27 +23795,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1929.0, - "completions/mean_length": 726.5546875, - "completions/mean_terminated_length": 700.2310791015625, - "completions/min_length": 57.0, - "completions/min_terminated_length": 57.0, + "completions/max_terminated_length": 1792.0, + "completions/mean_length": 873.98828125, + "completions/mean_terminated_length": 843.40283203125, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, "epoch": 0.28027652129384656, - "grad_norm": 1.8945742845535278, - "kl": 4.0, - "learning_rate": 9.138684846644408e-07, - "loss": 0.1925, - "num_tokens": 510383462.0, - "reward": 0.95458984375, - "reward_std": 0.28026509284973145, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, + "grad_norm": 18.691938400268555, + "kl": 1.11962890625, + "learning_rate": 9.141206843053624e-07, + "loss": 0.1445, + "num_tokens": 559547356.0, + "reward": 1.07861328125, + "reward_std": 0.2303360104560852, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.87646484375, - "rewards/tag_count_reward/std": 0.21783897280693054, + "rewards/tag_count_reward/mean": 0.95947265625, + "rewards/tag_count_reward/std": 0.14143213629722595, "step": 821 }, { @@ -23824,27 +23824,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1948.0, - "completions/mean_length": 728.2421875, - "completions/mean_terminated_length": 709.9485473632812, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 915.697265625, + "completions/mean_terminated_length": 864.859130859375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, "epoch": 0.2806179056072374, - "grad_norm": 7.067627906799316, - "kl": 2.88671875, - "learning_rate": 9.135528002158977e-07, - "loss": 0.2135, - "num_tokens": 510837762.0, - "reward": 0.9560546875, - "reward_std": 0.2493462711572647, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, + "grad_norm": 18.163387298583984, + "kl": 2.05078125, + "learning_rate": 9.138052933158466e-07, + "loss": 0.2047, + "num_tokens": 560097633.0, + "reward": 1.03759765625, + "reward_std": 0.21714366972446442, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8974609375, - "rewards/tag_count_reward/std": 0.19278420507907867, + "rewards/tag_count_reward/mean": 0.94970703125, + "rewards/tag_count_reward/std": 0.16339389979839325, "step": 822 }, { @@ -23853,27 +23853,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1948.0, - "completions/mean_length": 703.02734375, - "completions/mean_terminated_length": 681.6785888671875, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/max_terminated_length": 1827.0, + "completions/mean_length": 843.0078125, + "completions/mean_terminated_length": 821.447265625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.28095928992062813, - "grad_norm": 6.042422771453857, - "kl": 1.919921875, - "learning_rate": 9.132365997706493e-07, - "loss": 0.1195, - "num_tokens": 511277152.0, - "reward": 1.02490234375, - "reward_std": 0.23825043439865112, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, + "grad_norm": 10.442327499389648, + "kl": 1.27294921875, + "learning_rate": 9.134893855794118e-07, + "loss": 0.1627, + "num_tokens": 560608693.0, + "reward": 1.07568359375, + "reward_std": 0.2077239453792572, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.17458952963352203, + "rewards/tag_count_reward/mean": 0.95458984375, + "rewards/tag_count_reward/std": 0.14924278855323792, "step": 823 }, { @@ -23882,27 +23882,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 681.3203125, - "completions/mean_terminated_length": 670.55908203125, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/max_terminated_length": 1793.0, + "completions/mean_length": 861.103515625, + "completions/mean_terminated_length": 802.7315063476562, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.28130067423401894, - "grad_norm": 6.865614891052246, - "kl": 2.017578125, - "learning_rate": 9.129198837774846e-07, - "loss": 0.1366, - "num_tokens": 511708660.0, - "reward": 0.994140625, - "reward_std": 0.2854694128036499, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.908203125, - "rewards/tag_count_reward/std": 0.1943957805633545, + "grad_norm": 12.728594779968262, + "kl": 2.93359375, + "learning_rate": 9.131729615447715e-07, + "loss": 0.2667, + "num_tokens": 561132250.0, + "reward": 1.0615234375, + "reward_std": 0.24650120735168457, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9423828125, + "rewards/tag_count_reward/std": 0.1806451380252838, "step": 824 }, { @@ -23911,27 +23911,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1960.0, - "completions/mean_length": 752.12109375, - "completions/mean_terminated_length": 731.5516357421875, - "completions/min_length": 18.0, - "completions/min_terminated_length": 18.0, - "epoch": 0.28164205854740976, - "grad_norm": 3.810732364654541, - "kl": 2.841796875, - "learning_rate": 9.126026526859236e-07, - "loss": 0.1974, - "num_tokens": 512170514.0, - "reward": 0.97314453125, - "reward_std": 0.2556039094924927, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, + "completions/max_terminated_length": 1943.0, + "completions/mean_length": 867.466796875, + "completions/mean_terminated_length": 816.9755859375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.28164205854740976, + "grad_norm": 8.853020668029785, + "kl": 2.251953125, + "learning_rate": 9.128560216613729e-07, + "loss": 0.2186, + "num_tokens": 561653161.0, + "reward": 1.00927734375, + "reward_std": 0.20880228281021118, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.19274641573429108, + "rewards/tag_count_reward/mean": 0.93310546875, + "rewards/tag_count_reward/std": 0.17980943620204926, "step": 825 }, { @@ -23940,27 +23940,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.001953125, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1886.0, - "completions/mean_length": 694.3203125, - "completions/mean_terminated_length": 691.6712036132812, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/max_terminated_length": 1896.0, + "completions/mean_length": 881.134765625, + "completions/mean_terminated_length": 792.884521484375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.28198344286080057, - "grad_norm": 2.4061853885650635, - "kl": 3.23046875, - "learning_rate": 9.122849069462181e-07, - "loss": 0.1637, - "num_tokens": 512600934.0, - "reward": 0.97998046875, - "reward_std": 0.2752445340156555, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, + "grad_norm": 26.289365768432617, + "kl": 5.01171875, + "learning_rate": 9.125385663793951e-07, + "loss": 0.405, + "num_tokens": 562179230.0, + "reward": 0.97607421875, + "reward_std": 0.24553290009498596, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90576171875, - "rewards/tag_count_reward/std": 0.19542469084262848, + "rewards/tag_count_reward/mean": 0.91162109375, + "rewards/tag_count_reward/std": 0.20542235672473907, "step": 826 }, { @@ -23969,27 +23969,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.00390625, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1625.0, - "completions/mean_length": 655.1875, - "completions/mean_terminated_length": 649.7255249023438, - "completions/min_length": 18.0, - "completions/min_terminated_length": 18.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 822.25, + "completions/mean_terminated_length": 777.5870361328125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, "epoch": 0.28232482717419133, - "grad_norm": 2.6872496604919434, - "kl": 2.453125, - "learning_rate": 9.119666470093501e-07, - "loss": 0.0745, - "num_tokens": 513029270.0, - "reward": 1.03125, - "reward_std": 0.26225048303604126, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.18489299714565277, + "grad_norm": 7.790221691131592, + "kl": 3.10546875, + "learning_rate": 9.122205961497502e-07, + "loss": 0.2181, + "num_tokens": 562693102.0, + "reward": 1.08203125, + "reward_std": 0.1966366171836853, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.955078125, + "rewards/tag_count_reward/std": 0.15061385929584503, "step": 827 }, { @@ -23998,27 +23998,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.00390625, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 718.95703125, - "completions/mean_terminated_length": 713.7451171875, - "completions/min_length": 20.0, - "completions/min_terminated_length": 20.0, + "completions/max_terminated_length": 1798.0, + "completions/mean_length": 882.1484375, + "completions/mean_terminated_length": 812.1491088867188, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, "epoch": 0.28266621148758214, - "grad_norm": 2.1618266105651855, - "kl": 3.9921875, - "learning_rate": 9.116478733270312e-07, - "loss": 0.1946, - "num_tokens": 513477536.0, - "reward": 0.9921875, - "reward_std": 0.2947632074356079, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.896484375, - "rewards/tag_count_reward/std": 0.19974872469902039, + "grad_norm": 21.112844467163086, + "kl": 5.59375, + "learning_rate": 9.11902111424081e-07, + "loss": 0.3911, + "num_tokens": 563224922.0, + "reward": 1.03759765625, + "reward_std": 0.28140753507614136, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92041015625, + "rewards/tag_count_reward/std": 0.20184673368930817, "step": 828 }, { @@ -24027,27 +24027,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.005859375, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 739.02734375, - "completions/mean_terminated_length": 731.3123779296875, - "completions/min_length": 39.0, - "completions/min_terminated_length": 39.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 856.671875, + "completions/mean_terminated_length": 810.7586059570312, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, "epoch": 0.28300759580097296, - "grad_norm": 5.863043308258057, - "kl": 4.46875, - "learning_rate": 9.113285863517024e-07, - "loss": 0.1952, - "num_tokens": 513935646.0, - "reward": 0.9384765625, - "reward_std": 0.26553913950920105, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.8955078125, - "rewards/tag_count_reward/std": 0.2010718286037445, + "grad_norm": 10.879014015197754, + "kl": 3.91015625, + "learning_rate": 9.11583112654761e-07, + "loss": 0.2928, + "num_tokens": 563743266.0, + "reward": 0.9873046875, + "reward_std": 0.24860429763793945, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.18910102546215057, "step": 829 }, { @@ -24056,27 +24056,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.001953125, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1931.0, - "completions/mean_length": 774.67578125, - "completions/mean_terminated_length": 772.1839599609375, - "completions/min_length": 230.0, - "completions/min_terminated_length": 230.0, + "completions/max_terminated_length": 1804.0, + "completions/mean_length": 909.609375, + "completions/mean_terminated_length": 826.0796508789062, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.28334898011436377, - "grad_norm": 1.5997233390808105, - "kl": 3.85546875, - "learning_rate": 9.110087865365333e-07, - "loss": 0.2139, - "num_tokens": 514414040.0, - "reward": 0.96435546875, - "reward_std": 0.23553423583507538, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.91162109375, - "rewards/tag_count_reward/std": 0.1867077797651291, + "grad_norm": 6.0244550704956055, + "kl": 4.24609375, + "learning_rate": 9.112636002948949e-07, + "loss": 0.402, + "num_tokens": 564290746.0, + "reward": 0.9921875, + "reward_std": 0.23310911655426025, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.20437365770339966, "step": 830 }, { @@ -24085,27 +24085,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 815.685546875, - "completions/mean_terminated_length": 781.0421142578125, - "completions/min_length": 58.0, - "completions/min_terminated_length": 58.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 953.25, + "completions/mean_terminated_length": 867.9746704101562, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.2836903644277545, - "grad_norm": 1.3619939088821411, - "kl": 4.3125, - "learning_rate": 9.10688474335421e-07, - "loss": 0.2463, - "num_tokens": 514906375.0, - "reward": 0.93408203125, - "reward_std": 0.2167353332042694, - "rewards/accuracy_reward/mean": 0.02734375, - "rewards/accuracy_reward/std": 0.16324250400066376, + "grad_norm": 7.924810886383057, + "kl": 5.46875, + "learning_rate": 9.109435747983158e-07, + "loss": 0.4281, + "num_tokens": 564853514.0, + "reward": 0.9384765625, + "reward_std": 0.2691951394081116, + "rewards/accuracy_reward/mean": 0.044921875, + "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.1952681541442871, + "rewards/tag_count_reward/mean": 0.8935546875, + "rewards/tag_count_reward/std": 0.21929316222667694, "step": 831 }, { @@ -24114,27 +24114,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1968.0, - "completions/mean_length": 716.037109375, - "completions/mean_terminated_length": 702.9013671875, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_terminated_length": 1906.0, + "completions/mean_length": 839.056640625, + "completions/mean_terminated_length": 779.600341796875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.28403174874114534, - "grad_norm": 3.041804552078247, - "kl": 3.1796875, - "learning_rate": 9.103676502029901e-07, - "loss": 0.216, - "num_tokens": 515353962.0, - "reward": 1.046875, - "reward_std": 0.2764724791049957, - "rewards/accuracy_reward/mean": 0.13750000298023224, - "rewards/accuracy_reward/std": 0.34473371505737305, + "grad_norm": 7.967062950134277, + "kl": 4.35546875, + "learning_rate": 9.106230366195859e-07, + "loss": 0.3342, + "num_tokens": 565364087.0, + "reward": 1.08642578125, + "reward_std": 0.2985227108001709, + "rewards/accuracy_reward/mean": 0.17291666567325592, + "rewards/accuracy_reward/std": 0.3785697817802429, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.1813446581363678, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.192215234041214, "step": 832 }, { @@ -24143,27 +24143,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1927.0, - "completions/mean_length": 773.44140625, - "completions/mean_terminated_length": 755.7742919921875, - "completions/min_length": 183.0, - "completions/min_terminated_length": 183.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 908.55859375, + "completions/mean_terminated_length": 862.2398071289062, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.28437313305453615, - "grad_norm": 1.7975035905838013, - "kl": 3.40625, - "learning_rate": 9.100463145945921e-07, - "loss": 0.1994, - "num_tokens": 515842252.0, - "reward": 0.9501953125, - "reward_std": 0.22530975937843323, - "rewards/accuracy_reward/mean": 0.0463709682226181, - "rewards/accuracy_reward/std": 0.21049949526786804, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.9033203125, - "rewards/tag_count_reward/std": 0.1951684206724167, + "grad_norm": 4.41445255279541, + "kl": 4.25, + "learning_rate": 9.103019862139961e-07, + "loss": 0.3405, + "num_tokens": 565921557.0, + "reward": 0.97705078125, + "reward_std": 0.24522006511688232, + "rewards/accuracy_reward/mean": 0.05443548411130905, + "rewards/accuracy_reward/std": 0.2271040678024292, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.19724006950855255, "step": 833 }, { @@ -24172,27 +24172,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.005859375, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 715.888671875, - "completions/mean_terminated_length": 708.037353515625, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 833.853515625, + "completions/mean_terminated_length": 771.5256958007812, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, "epoch": 0.28471451736792697, - "grad_norm": 3.068521738052368, - "kl": 3.193359375, - "learning_rate": 9.097244679663037e-07, - "loss": 0.2008, - "num_tokens": 516292371.0, - "reward": 1.033203125, - "reward_std": 0.2703208327293396, + "grad_norm": 4.675474643707275, + "kl": 4.44140625, + "learning_rate": 9.099804240375643e-07, + "loss": 0.3594, + "num_tokens": 566432074.0, + "reward": 1.04931640625, + "reward_std": 0.26661860942840576, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.18268859386444092, + "rewards/tag_count_reward/mean": 0.93408203125, + "rewards/tag_count_reward/std": 0.17949029803276062, "step": 834 }, { @@ -24201,27 +24201,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 718.63671875, - "completions/mean_terminated_length": 705.526611328125, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 907.27734375, + "completions/mean_terminated_length": 831.2291870117188, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.2850559016813177, - "grad_norm": 2.360788345336914, - "kl": 4.5390625, - "learning_rate": 9.094021107749277e-07, - "loss": 0.2485, - "num_tokens": 516736937.0, + "grad_norm": 4.916335105895996, + "kl": 4.91796875, + "learning_rate": 9.096583505470359e-07, + "loss": 0.3797, + "num_tokens": 566973224.0, "reward": 1.04931640625, - "reward_std": 0.3307260274887085, - "rewards/accuracy_reward/mean": 0.146484375, - "rewards/accuracy_reward/std": 0.35393697023391724, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.90087890625, - "rewards/tag_count_reward/std": 0.20646168291568756, + "reward_std": 0.303281307220459, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91064453125, + "rewards/tag_count_reward/std": 0.21203739941120148, "step": 835 }, { @@ -24230,27 +24230,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1888.0, - "completions/mean_length": 684.75390625, - "completions/mean_terminated_length": 671.3096923828125, - "completions/min_length": 61.0, - "completions/min_terminated_length": 61.0, + "completions/max_terminated_length": 1936.0, + "completions/mean_length": 847.298828125, + "completions/mean_terminated_length": 795.945068359375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.28539728599470854, - "grad_norm": 3.68979549407959, - "kl": 4.734375, - "learning_rate": 9.090792434779911e-07, - "loss": 0.2605, - "num_tokens": 517166475.0, - "reward": 1.048828125, - "reward_std": 0.32015591859817505, - "rewards/accuracy_reward/mean": 0.138671875, - "rewards/accuracy_reward/std": 0.34594178199768066, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.90625, - "rewards/tag_count_reward/std": 0.19345958530902863, + "grad_norm": 3.872030258178711, + "kl": 3.265625, + "learning_rate": 9.093357661998817e-07, + "loss": 0.266, + "num_tokens": 567485985.0, + "reward": 1.09716796875, + "reward_std": 0.2875131070613861, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.18053600192070007, "step": 836 }, { @@ -24259,27 +24259,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1984.0, - "completions/mean_length": 767.984375, - "completions/mean_terminated_length": 745.0814819335938, - "completions/min_length": 71.0, - "completions/min_terminated_length": 71.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 919.724609375, + "completions/mean_terminated_length": 864.235595703125, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, "epoch": 0.28573867030809935, - "grad_norm": 3.829807996749878, - "kl": 5.890625, - "learning_rate": 9.087558665337447e-07, - "loss": 0.3743, - "num_tokens": 517634995.0, - "reward": 0.99169921875, - "reward_std": 0.3004174530506134, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.88623046875, - "rewards/tag_count_reward/std": 0.21811513602733612, + "grad_norm": 9.094332695007324, + "kl": 4.57421875, + "learning_rate": 9.090126714542989e-07, + "loss": 0.3969, + "num_tokens": 568032196.0, + "reward": 1.04443359375, + "reward_std": 0.28874391317367554, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91748046875, + "rewards/tag_count_reward/std": 0.2054828256368637, "step": 837 }, { @@ -24288,27 +24288,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 752.056640625, - "completions/mean_terminated_length": 728.8687744140625, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 956.232421875, + "completions/mean_terminated_length": 890.68115234375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, "epoch": 0.28608005462149017, - "grad_norm": 3.1375844478607178, - "kl": 3.765625, - "learning_rate": 9.084319804011631e-07, - "loss": 0.2343, - "num_tokens": 518098624.0, - "reward": 0.9423828125, - "reward_std": 0.2559373080730438, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, + "grad_norm": 11.085018157958984, + "kl": 3.46875, + "learning_rate": 9.086890667692094e-07, + "loss": 0.3482, + "num_tokens": 568600363.0, + "reward": 1.0087890625, + "reward_std": 0.290497362613678, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9013671875, - "rewards/tag_count_reward/std": 0.20401567220687866, + "rewards/tag_count_reward/mean": 0.9111328125, + "rewards/tag_count_reward/std": 0.21367871761322021, "step": 838 }, { @@ -24317,27 +24317,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1996.0, - "completions/mean_length": 757.591796875, - "completions/mean_terminated_length": 726.6220092773438, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/max_terminated_length": 1887.0, + "completions/mean_length": 937.859375, + "completions/mean_terminated_length": 868.7635498046875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.2864214389348809, - "grad_norm": 2.0383784770965576, - "kl": 4.453125, - "learning_rate": 9.081075855399434e-07, - "loss": 0.2651, - "num_tokens": 518557599.0, - "reward": 1.0068359375, - "reward_std": 0.31170833110809326, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.8974609375, - "rewards/tag_count_reward/std": 0.2115394026041031, + "grad_norm": 8.503934860229492, + "kl": 3.41796875, + "learning_rate": 9.083649526042594e-07, + "loss": 0.2905, + "num_tokens": 569151635.0, + "reward": 1.07373046875, + "reward_std": 0.28665444254875183, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.20582664012908936, "step": 839 }, { @@ -24346,27 +24346,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 701.892578125, - "completions/mean_terminated_length": 685.9308471679688, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 903.974609375, + "completions/mean_terminated_length": 801.7425537109375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.28676282324827174, - "grad_norm": 3.5112719535827637, - "kl": 3.80078125, - "learning_rate": 9.077826824105049e-07, - "loss": 0.2563, - "num_tokens": 518990744.0, - "reward": 0.96142578125, - "reward_std": 0.2386517971754074, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.18033477663993835, + "grad_norm": 5.045845985412598, + "kl": 5.7265625, + "learning_rate": 9.080403294198188e-07, + "loss": 0.4764, + "num_tokens": 569688246.0, + "reward": 0.951171875, + "reward_std": 0.2592836916446686, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.904296875, + "rewards/tag_count_reward/std": 0.22090166807174683, "step": 840 }, { @@ -24375,27 +24375,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1908.0, - "completions/mean_length": 749.23828125, - "completions/mean_terminated_length": 715.40283203125, - "completions/min_length": 205.0, - "completions/min_terminated_length": 205.0, + "completions/max_terminated_length": 1973.0, + "completions/mean_length": 957.3828125, + "completions/mean_terminated_length": 859.9234008789062, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, "epoch": 0.28710420756166255, - "grad_norm": 2.8488667011260986, - "kl": 4.8046875, - "learning_rate": 9.074572714739881e-07, - "loss": 0.3292, - "num_tokens": 519449026.0, - "reward": 0.94873046875, - "reward_std": 0.24731914699077606, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.90185546875, - "rewards/tag_count_reward/std": 0.19474875926971436, + "grad_norm": 4.079229354858398, + "kl": 5.2734375, + "learning_rate": 9.077151976769803e-07, + "loss": 0.4387, + "num_tokens": 570253098.0, + "reward": 0.97802734375, + "reward_std": 0.30761438608169556, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88818359375, + "rewards/tag_count_reward/std": 0.2373451143503189, "step": 841 }, { @@ -24404,27 +24404,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.083984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 689.671875, - "completions/mean_terminated_length": 657.072021484375, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 1833.0, + "completions/mean_length": 883.060546875, + "completions/mean_terminated_length": 776.2537231445312, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, "epoch": 0.28744559187505336, - "grad_norm": 2.858069896697998, - "kl": 5.765625, - "learning_rate": 9.071313531922541e-07, - "loss": 0.3456, - "num_tokens": 519869770.0, - "reward": 1.013671875, - "reward_std": 0.328676700592041, - "rewards/accuracy_reward/mean": 0.12890625, - "rewards/accuracy_reward/std": 0.33542385697364807, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.8828125, - "rewards/tag_count_reward/std": 0.21714095771312714, + "grad_norm": 6.058124542236328, + "kl": 6.34765625, + "learning_rate": 9.073895578375593e-07, + "loss": 0.516, + "num_tokens": 570772857.0, + "reward": 1.0361328125, + "reward_std": 0.31683483719825745, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8955078125, + "rewards/tag_count_reward/std": 0.2310728281736374, "step": 842 }, { @@ -24433,27 +24433,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.130859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 757.505859375, - "completions/mean_terminated_length": 729.171630859375, - "completions/min_length": 175.0, - "completions/min_terminated_length": 175.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1010.8984375, + "completions/mean_terminated_length": 854.7505493164062, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.2877869761884441, - "grad_norm": 1.8919070959091187, - "kl": 5.265625, - "learning_rate": 9.068049280278846e-07, - "loss": 0.3371, - "num_tokens": 520338189.0, - "reward": 0.9970703125, - "reward_std": 0.27113696932792664, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.9013671875, - "rewards/tag_count_reward/std": 0.20639976859092712, + "grad_norm": 12.013524055480957, + "kl": 7.5234375, + "learning_rate": 9.070634103640927e-07, + "loss": 0.5451, + "num_tokens": 571371013.0, + "reward": 0.98583984375, + "reward_std": 0.305656760931015, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86083984375, + "rewards/tag_count_reward/std": 0.2659669816493988, "step": 843 }, { @@ -24462,27 +24462,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 808.9765625, - "completions/mean_terminated_length": 763.8299560546875, - "completions/min_length": 210.0, - "completions/min_terminated_length": 210.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 982.544921875, + "completions/mean_terminated_length": 835.7489013671875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, "epoch": 0.28812836050183493, - "grad_norm": 3.263883590698242, - "kl": 6.1015625, - "learning_rate": 9.064779964441802e-07, - "loss": 0.3948, - "num_tokens": 520829713.0, - "reward": 0.99609375, - "reward_std": 0.31634271144866943, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.884765625, - "rewards/tag_count_reward/std": 0.22426731884479523, + "grad_norm": 8.356927871704102, + "kl": 6.375, + "learning_rate": 9.067367557198384e-07, + "loss": 0.5076, + "num_tokens": 571951404.0, + "reward": 0.99658203125, + "reward_std": 0.3164166808128357, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88720703125, + "rewards/tag_count_reward/std": 0.23791208863258362, "step": 844 }, { @@ -24491,27 +24491,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.12890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1881.0, - "completions/mean_length": 673.951171875, - "completions/mean_terminated_length": 657.6581420898438, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 1870.0, + "completions/mean_length": 941.095703125, + "completions/mean_terminated_length": 777.2937622070312, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.28846974481522575, - "grad_norm": 2.096222400665283, - "kl": 3.875, - "learning_rate": 9.061505589051606e-07, - "loss": 0.2581, - "num_tokens": 521249912.0, - "reward": 0.99169921875, - "reward_std": 0.24171248078346252, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, + "grad_norm": 4.825211048126221, + "kl": 6.66015625, + "learning_rate": 9.064095943687747e-07, + "loss": 0.5743, + "num_tokens": 572508381.0, + "reward": 0.95849609375, + "reward_std": 0.30790483951568604, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.17035679519176483, + "rewards/tag_count_reward/mean": 0.87451171875, + "rewards/tag_count_reward/std": 0.2555638253688812, "step": 845 }, { @@ -24520,27 +24520,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.083984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 737.296875, - "completions/mean_terminated_length": 716.4921264648438, - "completions/min_length": 184.0, - "completions/min_terminated_length": 184.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 888.830078125, + "completions/mean_terminated_length": 782.55224609375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.28881112912861656, - "grad_norm": 4.202604293823242, - "kl": 4.58203125, - "learning_rate": 9.058226158755634e-07, - "loss": 0.3379, - "num_tokens": 521700464.0, - "reward": 0.990234375, - "reward_std": 0.2905173897743225, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, + "grad_norm": 4.776530742645264, + "kl": 4.6328125, + "learning_rate": 9.060819267755999e-07, + "loss": 0.4377, + "num_tokens": 573036518.0, + "reward": 1.01708984375, + "reward_std": 0.27943456172943115, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.904296875, - "rewards/tag_count_reward/std": 0.20240987837314606, + "rewards/tag_count_reward/mean": 0.91552734375, + "rewards/tag_count_reward/std": 0.21231211721897125, "step": 846 }, { @@ -24549,27 +24549,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.11328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 783.7734375, - "completions/mean_terminated_length": 756.0159912109375, - "completions/min_length": 63.0, - "completions/min_terminated_length": 63.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 982.671875, + "completions/mean_terminated_length": 846.5726318359375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, "epoch": 0.2891525134420073, - "grad_norm": 1.9926021099090576, - "kl": 5.2109375, - "learning_rate": 9.05494167820844e-07, - "loss": 0.364, - "num_tokens": 522177132.0, - "reward": 0.98291015625, - "reward_std": 0.28996962308883667, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.89501953125, - "rewards/tag_count_reward/std": 0.21004518866539001, + "grad_norm": 8.882272720336914, + "kl": 6.40625, + "learning_rate": 9.057537534057311e-07, + "loss": 0.4843, + "num_tokens": 573615022.0, + "reward": 0.9814453125, + "reward_std": 0.32508599758148193, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8857421875, + "rewards/tag_count_reward/std": 0.24457286298274994, "step": 847 }, { @@ -24578,27 +24578,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 751.43359375, - "completions/mean_terminated_length": 730.8532104492188, - "completions/min_length": 173.0, - "completions/min_terminated_length": 173.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 991.978515625, + "completions/mean_terminated_length": 864.88623046875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.28949389775539813, - "grad_norm": 2.8588552474975586, - "kl": 4.7890625, - "learning_rate": 9.05165215207174e-07, - "loss": 0.2968, - "num_tokens": 522645802.0, - "reward": 0.966796875, - "reward_std": 0.2664114832878113, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, + "grad_norm": 5.451161861419678, + "kl": 5.96875, + "learning_rate": 9.054250747253037e-07, + "loss": 0.481, + "num_tokens": 574206851.0, + "reward": 0.96240234375, + "reward_std": 0.3031417727470398, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90625, - "rewards/tag_count_reward/std": 0.19155354797840118, + "rewards/tag_count_reward/mean": 0.88818359375, + "rewards/tag_count_reward/std": 0.24244357645511627, "step": 848 }, { @@ -24607,27 +24607,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1965.0, - "completions/mean_length": 723.662109375, - "completions/mean_terminated_length": 702.6409301757812, - "completions/min_length": 174.0, - "completions/min_terminated_length": 174.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 927.46484375, + "completions/mean_terminated_length": 822.1154174804688, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, "epoch": 0.28983528206878895, - "grad_norm": 2.4607505798339844, - "kl": 5.546875, - "learning_rate": 9.048357585014417e-07, - "loss": 0.3355, - "num_tokens": 523092253.0, - "reward": 1.01220703125, - "reward_std": 0.2797544598579407, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.19333051145076752, + "grad_norm": 6.084855079650879, + "kl": 5.796875, + "learning_rate": 9.05095891201171e-07, + "loss": 0.451, + "num_tokens": 574757649.0, + "reward": 1.0439453125, + "reward_std": 0.2905218005180359, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9091796875, + "rewards/tag_count_reward/std": 0.21570825576782227, "step": 849 }, { @@ -24636,56 +24636,56 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.087890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 716.376953125, - "completions/mean_terminated_length": 689.8506469726562, - "completions/min_length": 43.0, - "completions/min_terminated_length": 43.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 937.080078125, + "completions/mean_terminated_length": 830.0321044921875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.29017666638217976, - "grad_norm": 9.725028038024902, - "kl": 5.46875, - "learning_rate": 9.045057981712504e-07, - "loss": 0.293, - "num_tokens": 523538302.0, - "reward": 1.00537109375, - "reward_std": 0.3139130473136902, - "rewards/accuracy_reward/mean": 0.0927419364452362, - "rewards/accuracy_reward/std": 0.2903633117675781, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.20216365158557892, - "step": 850 - }, - { - "clip_ratio/high_max": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, + "grad_norm": 5.533233642578125, + "kl": 6.21875, + "learning_rate": 9.047662033009035e-07, + "loss": 0.4752, + "num_tokens": 575316698.0, + "reward": 1.02685546875, + "reward_std": 0.3341251611709595, + "rewards/accuracy_reward/mean": 0.13104838132858276, + "rewards/accuracy_reward/std": 0.3377939760684967, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89990234375, + "rewards/tag_count_reward/std": 0.23691797256469727, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 727.6640625, - "completions/mean_terminated_length": 701.362548828125, - "completions/min_length": 77.0, - "completions/min_terminated_length": 77.0, + "completions/max_terminated_length": 1872.0, + "completions/mean_length": 924.703125, + "completions/mean_terminated_length": 824.3233642578125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.2905180506955705, - "grad_norm": 2.414006233215332, - "kl": 5.2578125, - "learning_rate": 9.041753346849187e-07, - "loss": 0.3348, - "num_tokens": 523989938.0, - "reward": 0.94482421875, - "reward_std": 0.2373579442501068, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.90576171875, - "rewards/tag_count_reward/std": 0.19729334115982056, + "grad_norm": 7.086710453033447, + "kl": 5.3515625, + "learning_rate": 9.044360114927879e-07, + "loss": 0.425, + "num_tokens": 575869218.0, + "reward": 0.95849609375, + "reward_std": 0.23783710598945618, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91162109375, + "rewards/tag_count_reward/std": 0.21530643105506897, "step": 851 }, { @@ -24694,27 +24694,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 717.421875, - "completions/mean_terminated_length": 682.7575073242188, - "completions/min_length": 202.0, - "completions/min_terminated_length": 202.0, + "completions/max_terminated_length": 1688.0, + "completions/mean_length": 946.5703125, + "completions/mean_terminated_length": 837.8455200195312, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, "epoch": 0.29085943500896133, - "grad_norm": 5.3849053382873535, - "kl": 4.13671875, - "learning_rate": 9.038443685114791e-07, - "loss": 0.3105, - "num_tokens": 524439306.0, - "reward": 1.01953125, - "reward_std": 0.2760043144226074, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.18053138256072998, + "grad_norm": 4.266741752624512, + "kl": 5.33984375, + "learning_rate": 9.041053162458265e-07, + "loss": 0.4014, + "num_tokens": 576435910.0, + "reward": 1.0234375, + "reward_std": 0.30263999104499817, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.2266491949558258, "step": 852 }, { @@ -24723,27 +24723,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1967.0, - "completions/mean_length": 720.787109375, - "completions/mean_terminated_length": 699.7202758789062, - "completions/min_length": 84.0, - "completions/min_terminated_length": 84.0, + "completions/max_terminated_length": 1718.0, + "completions/mean_length": 930.234375, + "completions/mean_terminated_length": 825.1453247070312, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, "epoch": 0.29120081932235214, - "grad_norm": 4.139332294464111, - "kl": 3.68359375, - "learning_rate": 9.035129001206771e-07, - "loss": 0.2529, - "num_tokens": 524883133.0, - "reward": 1.017578125, - "reward_std": 0.2673831582069397, - "rewards/accuracy_reward/mean": 0.10483870655298233, - "rewards/accuracy_reward/std": 0.30665475130081177, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.18882036209106445, + "grad_norm": 15.044821739196777, + "kl": 4.53515625, + "learning_rate": 9.037741180297375e-07, + "loss": 0.4134, + "num_tokens": 576986974.0, + "reward": 1.0517578125, + "reward_std": 0.2825441360473633, + "rewards/accuracy_reward/mean": 0.13709677755832672, + "rewards/accuracy_reward/std": 0.34429675340652466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9189453125, + "rewards/tag_count_reward/std": 0.21105100214481354, "step": 853 }, { @@ -24752,27 +24752,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1906.0, - "completions/mean_length": 711.49609375, - "completions/mean_terminated_length": 682.1516723632812, - "completions/min_length": 10.0, - "completions/min_terminated_length": 10.0, + "completions/max_terminated_length": 1834.0, + "completions/mean_length": 915.306640625, + "completions/mean_terminated_length": 821.913330078125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, "epoch": 0.29154220363574296, - "grad_norm": 3.6715269088745117, - "kl": 3.7109375, - "learning_rate": 9.03180929982972e-07, - "loss": 0.2463, - "num_tokens": 525314395.0, - "reward": 0.9794921875, - "reward_std": 0.2377520203590393, - "rewards/accuracy_reward/mean": 0.06451612710952759, - "rewards/accuracy_reward/std": 0.2459181249141693, + "grad_norm": 10.788235664367676, + "kl": 5.875, + "learning_rate": 9.034424173149522e-07, + "loss": 0.4766, + "num_tokens": 577522587.0, + "reward": 1.02783203125, + "reward_std": 0.28484830260276794, + "rewards/accuracy_reward/mean": 0.11491935700178146, + "rewards/accuracy_reward/std": 0.3192465901374817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.1849110871553421, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.2189328372478485, "step": 854 }, { @@ -24781,27 +24781,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1942.0, - "completions/mean_length": 730.3515625, - "completions/mean_terminated_length": 717.3569946289062, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/max_terminated_length": 1789.0, + "completions/mean_length": 956.994140625, + "completions/mean_terminated_length": 854.4209594726562, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, "epoch": 0.2918835879491337, - "grad_norm": 1.6833791732788086, - "kl": 4.6171875, - "learning_rate": 9.028484585695345e-07, - "loss": 0.3114, - "num_tokens": 525762831.0, - "reward": 0.9453125, - "reward_std": 0.23490703105926514, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, + "grad_norm": 39.10015106201172, + "kl": 8.21875, + "learning_rate": 9.031102145726168e-07, + "loss": 0.5337, + "num_tokens": 578087064.0, + "reward": 0.9736328125, + "reward_std": 0.28830575942993164, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91015625, - "rewards/tag_count_reward/std": 0.19151362776756287, + "rewards/tag_count_reward/mean": 0.9072265625, + "rewards/tag_count_reward/std": 0.226512148976326, "step": 855 }, { @@ -24810,27 +24810,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 735.22265625, - "completions/mean_terminated_length": 724.8858032226562, - "completions/min_length": 91.0, - "completions/min_terminated_length": 91.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1003.05078125, + "completions/mean_terminated_length": 914.4957885742188, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.29222497226252453, - "grad_norm": 4.737995624542236, - "kl": 3.63671875, - "learning_rate": 9.025154863522467e-07, - "loss": 0.1956, - "num_tokens": 526217937.0, - "reward": 1.00146484375, - "reward_std": 0.2507244944572449, - "rewards/accuracy_reward/mean": 0.0947580635547638, - "rewards/accuracy_reward/std": 0.29317617416381836, + "grad_norm": 8.92786693572998, + "kl": 5.12109375, + "learning_rate": 9.027775102745899e-07, + "loss": 0.4159, + "num_tokens": 578679298.0, + "reward": 1.0390625, + "reward_std": 0.2574598789215088, + "rewards/accuracy_reward/mean": 0.12903225421905518, + "rewards/accuracy_reward/std": 0.33557409048080444, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.1890321522951126, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.21770349144935608, "step": 856 }, { @@ -24839,27 +24839,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.087890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 770.6171875, - "completions/mean_terminated_length": 724.0728759765625, - "completions/min_length": 169.0, - "completions/min_terminated_length": 169.0, + "completions/max_terminated_length": 1833.0, + "completions/mean_length": 1022.564453125, + "completions/mean_terminated_length": 923.7537231445312, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, "epoch": 0.29256635657591534, - "grad_norm": 5.21553373336792, - "kl": 5.40625, - "learning_rate": 9.021820138037022e-07, - "loss": 0.3029, - "num_tokens": 526697133.0, - "reward": 1.0234375, - "reward_std": 0.3139427900314331, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, + "grad_norm": 86.81981658935547, + "kl": 7.6875, + "learning_rate": 9.02444304893443e-07, + "loss": 0.5075, + "num_tokens": 579287491.0, + "reward": 1.0546875, + "reward_std": 0.3531006872653961, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.892578125, - "rewards/tag_count_reward/std": 0.19828914105892181, + "rewards/tag_count_reward/mean": 0.904296875, + "rewards/tag_count_reward/std": 0.22637078166007996, "step": 857 }, { @@ -24868,27 +24868,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 715.330078125, - "completions/mean_terminated_length": 694.1766357421875, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 1886.0, + "completions/mean_length": 959.220703125, + "completions/mean_terminated_length": 884.2108764648438, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.29290774088930616, - "grad_norm": 1.9480489492416382, - "kl": 4.60546875, - "learning_rate": 9.01848041397204e-07, - "loss": 0.2503, - "num_tokens": 527136118.0, - "reward": 0.962890625, - "reward_std": 0.24685801565647125, - "rewards/accuracy_reward/mean": 0.058467742055654526, - "rewards/accuracy_reward/std": 0.23486268520355225, + "grad_norm": 17.985790252685547, + "kl": 4.94921875, + "learning_rate": 9.021105989024589e-07, + "loss": 0.4232, + "num_tokens": 579851348.0, + "reward": 1.01416015625, + "reward_std": 0.2654188275337219, + "rewards/accuracy_reward/mean": 0.0927419364452362, + "rewards/accuracy_reward/std": 0.2903633117675781, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.90625, - "rewards/tag_count_reward/std": 0.2009030431509018, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.20214001834392548, "step": 858 }, { @@ -24897,27 +24897,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.10546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1904.0, - "completions/mean_length": 706.712890625, - "completions/mean_terminated_length": 690.808349609375, - "completions/min_length": 8.0, - "completions/min_terminated_length": 8.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 970.20703125, + "completions/mean_terminated_length": 843.1310424804688, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, "epoch": 0.2932491252026969, - "grad_norm": 1.5688979625701904, - "kl": 3.5625, - "learning_rate": 9.015135696067649e-07, - "loss": 0.1984, - "num_tokens": 527576643.0, - "reward": 0.97412109375, - "reward_std": 0.21969881653785706, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, + "grad_norm": 16.842567443847656, + "kl": 4.84375, + "learning_rate": 9.017763927756317e-07, + "loss": 0.4549, + "num_tokens": 580426782.0, + "reward": 0.96630859375, + "reward_std": 0.2677017152309418, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.18282388150691986, + "rewards/tag_count_reward/mean": 0.90380859375, + "rewards/tag_count_reward/std": 0.2359561175107956, "step": 859 }, { @@ -24926,27 +24926,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.09765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 730.560546875, - "completions/mean_terminated_length": 717.5680541992188, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/max_terminated_length": 1877.0, + "completions/mean_length": 993.328125, + "completions/mean_terminated_length": 879.1861572265625, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, "epoch": 0.2935905095160877, - "grad_norm": 1.75925874710083, - "kl": 2.9765625, - "learning_rate": 9.011785989071066e-07, - "loss": 0.1618, - "num_tokens": 528023202.0, - "reward": 1.01318359375, - "reward_std": 0.25561508536338806, - "rewards/accuracy_reward/mean": 0.09072580933570862, - "rewards/accuracy_reward/std": 0.2875087857246399, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.17305581271648407, + "grad_norm": 27.68691635131836, + "kl": 4.08203125, + "learning_rate": 9.014416869876658e-07, + "loss": 0.4259, + "num_tokens": 581007878.0, + "reward": 1.0302734375, + "reward_std": 0.30928128957748413, + "rewards/accuracy_reward/mean": 0.11895161122083664, + "rewards/accuracy_reward/std": 0.3240584135055542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.22307756543159485, "step": 860 }, { @@ -24955,27 +24955,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 729.85546875, - "completions/mean_terminated_length": 708.9325561523438, - "completions/min_length": 90.0, - "completions/min_terminated_length": 90.0, + "completions/max_terminated_length": 1851.0, + "completions/mean_length": 977.7578125, + "completions/mean_terminated_length": 859.35791015625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.29393189382947854, - "grad_norm": 3.2166786193847656, - "kl": 3.55859375, - "learning_rate": 9.008431297736585e-07, - "loss": 0.2405, - "num_tokens": 528467288.0, - "reward": 1.02685546875, - "reward_std": 0.28205960988998413, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.1840168684720993, + "grad_norm": 23.26854133605957, + "kl": 3.01171875, + "learning_rate": 9.011064820139756e-07, + "loss": 0.382, + "num_tokens": 581578890.0, + "reward": 1.05078125, + "reward_std": 0.3200468122959137, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.22280539572238922, "step": 861 }, { @@ -24984,27 +24984,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1915.0, - "completions/mean_length": 709.005859375, - "completions/mean_terminated_length": 698.4625854492188, - "completions/min_length": 100.0, - "completions/min_terminated_length": 100.0, + "completions/max_terminated_length": 1893.0, + "completions/mean_length": 962.869140625, + "completions/mean_terminated_length": 842.8221435546875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, "epoch": 0.29427327814286935, - "grad_norm": 3.0324487686157227, - "kl": 2.662109375, - "learning_rate": 9.005071626825577e-07, - "loss": 0.1677, - "num_tokens": 528906331.0, - "reward": 1.02099609375, - "reward_std": 0.25581175088882446, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.16831976175308228, + "grad_norm": 18.85279083251953, + "kl": 3.33203125, + "learning_rate": 9.007707783306837e-07, + "loss": 0.3773, + "num_tokens": 582147911.0, + "reward": 1.044921875, + "reward_std": 0.31132763624191284, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.912109375, + "rewards/tag_count_reward/std": 0.22845487296581268, "step": 862 }, { @@ -25013,27 +25013,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1963.0, - "completions/mean_length": 723.283203125, - "completions/mean_terminated_length": 707.5751342773438, - "completions/min_length": 59.0, - "completions/min_terminated_length": 59.0, + "completions/max_terminated_length": 1932.0, + "completions/mean_length": 952.11328125, + "completions/mean_terminated_length": 856.7176513671875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.2946146624562601, - "grad_norm": 2.2073960304260254, - "kl": 4.109375, - "learning_rate": 9.001706981106482e-07, - "loss": 0.2688, - "num_tokens": 529349116.0, - "reward": 1.0986328125, - "reward_std": 0.3106450140476227, + "grad_norm": 17.94124984741211, + "kl": 2.404296875, + "learning_rate": 9.004345764146223e-07, + "loss": 0.2795, + "num_tokens": 582707857.0, + "reward": 1.08544921875, + "reward_std": 0.2995716333389282, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.9287109375, - "rewards/tag_count_reward/std": 0.17074689269065857, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91943359375, + "rewards/tag_count_reward/std": 0.21439915895462036, "step": 863 }, { @@ -25042,27 +25042,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.005859375, + "completions/clipped_ratio": 0.095703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1884.0, - "completions/mean_length": 731.982421875, - "completions/mean_terminated_length": 724.2259521484375, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 978.275390625, + "completions/mean_terminated_length": 865.0647583007812, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, "epoch": 0.2949560467696509, - "grad_norm": 1.7147998809814453, - "kl": 4.4140625, - "learning_rate": 8.998337365354798e-07, - "loss": 0.2635, - "num_tokens": 529803779.0, - "reward": 1.00927734375, - "reward_std": 0.27351608872413635, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.17359058558940887, + "grad_norm": 28.643335342407227, + "kl": 3.513671875, + "learning_rate": 9.000978767433303e-07, + "loss": 0.405, + "num_tokens": 583288622.0, + "reward": 1.03759765625, + "reward_std": 0.31883561611175537, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91259765625, + "rewards/tag_count_reward/std": 0.22240565717220306, "step": 864 }, { @@ -25071,27 +25071,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1800.0, - "completions/mean_length": 750.14453125, - "completions/mean_terminated_length": 729.543701171875, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 965.3984375, + "completions/mean_terminated_length": 883.5210571289062, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, "epoch": 0.29529743108304174, - "grad_norm": 4.473459243774414, - "kl": 4.53125, - "learning_rate": 8.994962784353079e-07, - "loss": 0.2475, - "num_tokens": 530265709.0, - "reward": 0.9765625, - "reward_std": 0.24417805671691895, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.90625, - "rewards/tag_count_reward/std": 0.19282633066177368, + "grad_norm": 20.81723403930664, + "kl": 2.892578125, + "learning_rate": 8.99760679795054e-07, + "loss": 0.3238, + "num_tokens": 583860762.0, + "reward": 1.02490234375, + "reward_std": 0.2676134705543518, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92919921875, + "rewards/tag_count_reward/std": 0.20270176231861115, "step": 865 }, { @@ -25100,27 +25100,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.103515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 720.380859375, - "completions/mean_terminated_length": 707.2879638671875, - "completions/min_length": 91.0, - "completions/min_terminated_length": 91.0, + "completions/max_terminated_length": 1783.0, + "completions/mean_length": 960.0859375, + "completions/mean_terminated_length": 834.4661865234375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, "epoch": 0.29563881539643255, - "grad_norm": 3.271315336227417, - "kl": 4.3828125, - "learning_rate": 8.991583242890924e-07, - "loss": 0.2248, - "num_tokens": 530709632.0, - "reward": 1.00537109375, - "reward_std": 0.25619742274284363, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.17805784940719604, + "grad_norm": 13.378107070922852, + "kl": 5.83984375, + "learning_rate": 8.994229860487461e-07, + "loss": 0.4686, + "num_tokens": 584427414.0, + "reward": 1.00244140625, + "reward_std": 0.27250322699546814, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90673828125, + "rewards/tag_count_reward/std": 0.22658121585845947, "step": 866 }, { @@ -25129,27 +25129,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.103515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1905.0, - "completions/mean_length": 748.685546875, - "completions/mean_terminated_length": 733.2786865234375, - "completions/min_length": 74.0, - "completions/min_terminated_length": 74.0, + "completions/max_terminated_length": 1558.0, + "completions/mean_length": 963.87890625, + "completions/mean_terminated_length": 838.6971435546875, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, "epoch": 0.2959801997098233, - "grad_norm": 3.136951208114624, - "kl": 4.26953125, - "learning_rate": 8.988198745764976e-07, - "loss": 0.2058, - "num_tokens": 531177903.0, - "reward": 0.96044921875, - "reward_std": 0.26861572265625, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.90185546875, - "rewards/tag_count_reward/std": 0.20692861080169678, + "grad_norm": 50.89612579345703, + "kl": 7.640625, + "learning_rate": 8.990847959840646e-07, + "loss": 0.5174, + "num_tokens": 585005864.0, + "reward": 0.982421875, + "reward_std": 0.2623264193534851, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.22488853335380554, "step": 867 }, { @@ -25158,27 +25158,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.134765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 749.0078125, - "completions/mean_terminated_length": 723.1314697265625, - "completions/min_length": 134.0, - "completions/min_terminated_length": 134.0, + "completions/max_terminated_length": 1953.0, + "completions/mean_length": 1023.705078125, + "completions/mean_terminated_length": 864.164794921875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.2963215840232141, - "grad_norm": 3.2323830127716064, - "kl": 2.6015625, - "learning_rate": 8.984809297778908e-07, - "loss": 0.1476, - "num_tokens": 531646547.0, - "reward": 0.95751953125, - "reward_std": 0.2431613653898239, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.90869140625, - "rewards/tag_count_reward/std": 0.19177725911140442, + "grad_norm": 27.698413848876953, + "kl": 7.6796875, + "learning_rate": 8.987461100813729e-07, + "loss": 0.5533, + "num_tokens": 585615153.0, + "reward": 0.9453125, + "reward_std": 0.285142183303833, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.880859375, + "rewards/tag_count_reward/std": 0.25501781702041626, "step": 868 }, { @@ -25187,27 +25187,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1984.0, - "completions/mean_length": 759.9921875, - "completions/mean_terminated_length": 747.2899780273438, - "completions/min_length": 173.0, - "completions/min_terminated_length": 173.0, + "completions/max_terminated_length": 1909.0, + "completions/mean_length": 1048.205078125, + "completions/mean_terminated_length": 905.3772583007812, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, "epoch": 0.29666296833660494, - "grad_norm": 1.4316259622573853, - "kl": 2.7890625, - "learning_rate": 8.981414903743423e-07, - "loss": 0.1238, - "num_tokens": 532112527.0, - "reward": 0.9697265625, - "reward_std": 0.24940599501132965, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.9091796875, - "rewards/tag_count_reward/std": 0.1865164041519165, + "grad_norm": 20.983936309814453, + "kl": 7.953125, + "learning_rate": 8.984069288217385e-07, + "loss": 0.5639, + "num_tokens": 586228698.0, + "reward": 0.95361328125, + "reward_std": 0.2943935692310333, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89111328125, + "rewards/tag_count_reward/std": 0.25070181488990784, "step": 869 }, { @@ -25216,27 +25216,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.130859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1997.0, - "completions/mean_length": 747.65625, - "completions/mean_terminated_length": 729.6317138671875, - "completions/min_length": 123.0, - "completions/min_terminated_length": 123.0, + "completions/max_terminated_length": 1952.0, + "completions/mean_length": 971.814453125, + "completions/mean_terminated_length": 809.7820434570312, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, "epoch": 0.29700435264999575, - "grad_norm": 2.9305245876312256, - "kl": 2.56640625, - "learning_rate": 8.97801556847624e-07, - "loss": 0.1271, - "num_tokens": 532564767.0, - "reward": 1.0126953125, - "reward_std": 0.2796088457107544, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.9072265625, - "rewards/tag_count_reward/std": 0.19705891609191895, + "grad_norm": 22.400407791137695, + "kl": 7.6953125, + "learning_rate": 8.980672526869323e-07, + "loss": 0.5867, + "num_tokens": 586795707.0, + "reward": 1.06640625, + "reward_std": 0.3414897620677948, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3776407241821289, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.24652054905891418, "step": 870 }, { @@ -25245,27 +25245,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.11328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1794.0, - "completions/mean_length": 787.693359375, - "completions/mean_terminated_length": 762.587646484375, - "completions/min_length": 27.0, - "completions/min_terminated_length": 27.0, + "completions/max_terminated_length": 1707.0, + "completions/mean_length": 989.60546875, + "completions/mean_terminated_length": 854.3920288085938, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, "epoch": 0.2973457369633865, - "grad_norm": 4.79686164855957, - "kl": 2.396484375, - "learning_rate": 8.974611296802096e-07, - "loss": 0.173, - "num_tokens": 533046402.0, - "reward": 0.982421875, - "reward_std": 0.2162947952747345, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.1696154922246933, + "grad_norm": 15.670980453491211, + "kl": 5.828125, + "learning_rate": 8.977270821594285e-07, + "loss": 0.4709, + "num_tokens": 587380721.0, + "reward": 0.9560546875, + "reward_std": 0.270131915807724, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9072265625, + "rewards/tag_count_reward/std": 0.2323758900165558, "step": 871 }, { @@ -25274,27 +25274,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1917.0, - "completions/mean_length": 740.326171875, - "completions/mean_terminated_length": 727.4299926757812, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/max_terminated_length": 1938.0, + "completions/mean_length": 957.123046875, + "completions/mean_terminated_length": 869.6687622070312, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, "epoch": 0.2976871212767773, - "grad_norm": 2.1338069438934326, - "kl": 2.603515625, - "learning_rate": 8.971202093552731e-07, - "loss": 0.169, - "num_tokens": 533502761.0, - "reward": 0.99951171875, - "reward_std": 0.271058589220047, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.1771378070116043, + "grad_norm": 27.98681640625, + "kl": 4.8125, + "learning_rate": 8.973864177224031e-07, + "loss": 0.4214, + "num_tokens": 587948080.0, + "reward": 1.02978515625, + "reward_std": 0.2903249263763428, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.2064477950334549, "step": 872 }, { @@ -25303,27 +25303,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.005859375, + "completions/clipped_ratio": 0.138671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1949.0, - "completions/mean_length": 706.083984375, - "completions/mean_terminated_length": 698.1748657226562, - "completions/min_length": 39.0, - "completions/min_terminated_length": 39.0, + "completions/max_terminated_length": 1681.0, + "completions/mean_length": 977.541015625, + "completions/mean_terminated_length": 805.1995849609375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, "epoch": 0.29802850559016814, - "grad_norm": 4.597038745880127, - "kl": 2.5234375, - "learning_rate": 8.967787963566887e-07, - "loss": 0.1367, - "num_tokens": 533941108.0, - "reward": 1.0380859375, - "reward_std": 0.2726272642612457, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, + "grad_norm": 11.250933647155762, + "kl": 6.34375, + "learning_rate": 8.970452598597341e-07, + "loss": 0.4883, + "num_tokens": 588525413.0, + "reward": 1.02685546875, + "reward_std": 0.3251800537109375, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.18731388449668884, + "rewards/tag_count_reward/mean": 0.88818359375, + "rewards/tag_count_reward/std": 0.25136032700538635, "step": 873 }, { @@ -25332,27 +25332,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.13671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1844.0, - "completions/mean_length": 755.275390625, - "completions/mean_terminated_length": 739.9466552734375, - "completions/min_length": 16.0, - "completions/min_terminated_length": 16.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1030.802734375, + "completions/mean_terminated_length": 869.7081909179688, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.29836988990355895, - "grad_norm": 0.946505069732666, - "kl": 2.72265625, - "learning_rate": 8.964368911690296e-07, - "loss": 0.1177, - "num_tokens": 534400353.0, - "reward": 0.974609375, - "reward_std": 0.24221986532211304, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.1822277307510376, + "grad_norm": 8.957963943481445, + "kl": 7.34375, + "learning_rate": 8.967036090560001e-07, + "loss": 0.5683, + "num_tokens": 589125728.0, + "reward": 0.97607421875, + "reward_std": 0.2756907343864441, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89013671875, + "rewards/tag_count_reward/std": 0.2483120709657669, "step": 874 }, { @@ -25361,27 +25361,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.17578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1928.0, - "completions/mean_length": 749.759765625, - "completions/mean_terminated_length": 731.764404296875, - "completions/min_length": 134.0, - "completions/min_terminated_length": 134.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 1067.927734375, + "completions/mean_terminated_length": 858.9075927734375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, "epoch": 0.2987112742169497, - "grad_norm": 4.6282057762146, - "kl": 3.8125, - "learning_rate": 8.960944942775675e-07, - "loss": 0.1705, - "num_tokens": 534876454.0, - "reward": 1.01416015625, - "reward_std": 0.26385408639907837, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.1713578850030899, + "grad_norm": 39.137306213378906, + "kl": 10.5078125, + "learning_rate": 8.963614657964798e-07, + "loss": 0.6851, + "num_tokens": 589764731.0, + "reward": 0.94873046875, + "reward_std": 0.32423490285873413, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.85107421875, + "rewards/tag_count_reward/std": 0.283108651638031, "step": 875 }, { @@ -25390,27 +25390,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.138671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 717.359375, - "completions/mean_terminated_length": 706.8818969726562, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/max_terminated_length": 1663.0, + "completions/mean_length": 993.515625, + "completions/mean_terminated_length": 823.7460327148438, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.2990526585303405, - "grad_norm": 5.376757621765137, - "kl": 4.5859375, - "learning_rate": 8.957516061682724e-07, - "loss": 0.2, - "num_tokens": 535316814.0, - "reward": 0.9931640625, - "reward_std": 0.2733707129955292, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, + "grad_norm": 10.723176002502441, + "kl": 7.2109375, + "learning_rate": 8.960188305671515e-07, + "loss": 0.5668, + "num_tokens": 590346483.0, + "reward": 1.01171875, + "reward_std": 0.3036569356918335, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.8916015625, - "rewards/tag_count_reward/std": 0.20741577446460724, + "rewards/tag_count_reward/mean": 0.892578125, + "rewards/tag_count_reward/std": 0.24317210912704468, "step": 876 }, { @@ -25419,27 +25419,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.1328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1849.0, - "completions/mean_length": 678.541015625, - "completions/mean_terminated_length": 659.5584106445312, - "completions/min_length": 87.0, - "completions/min_terminated_length": 87.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 934.328125, + "completions/mean_terminated_length": 763.7658081054688, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.29939404284373133, - "grad_norm": 2.8910319805145264, - "kl": 4.19140625, - "learning_rate": 8.954082273278112e-07, - "loss": 0.229, - "num_tokens": 535744499.0, - "reward": 0.98779296875, - "reward_std": 0.26597756147384644, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.90576171875, - "rewards/tag_count_reward/std": 0.20036904513835907, + "grad_norm": 7.5688252449035645, + "kl": 7.703125, + "learning_rate": 8.956757038546925e-07, + "loss": 0.6007, + "num_tokens": 590905131.0, + "reward": 0.97509765625, + "reward_std": 0.2729474902153015, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89697265625, + "rewards/tag_count_reward/std": 0.24281376600265503, "step": 877 }, { @@ -25448,27 +25448,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.005859375, + "completions/clipped_ratio": 0.138671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1665.0, - "completions/mean_length": 761.509765625, - "completions/mean_terminated_length": 753.9273071289062, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 1039.57421875, + "completions/mean_terminated_length": 877.219970703125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, "epoch": 0.29973542715712215, - "grad_norm": 3.734360694885254, - "kl": 3.84765625, - "learning_rate": 8.950643582435474e-07, - "loss": 0.1729, - "num_tokens": 536209304.0, - "reward": 0.98095703125, - "reward_std": 0.27370643615722656, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.017578125, - "rewards/format_reward/std": 0.13154059648513794, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.18806926906108856, + "grad_norm": 6.602210521697998, + "kl": 6.859375, + "learning_rate": 8.953320861464777e-07, + "loss": 0.5231, + "num_tokens": 591512305.0, + "reward": 0.9482421875, + "reward_std": 0.27819323539733887, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8916015625, + "rewards/tag_count_reward/std": 0.2437431812286377, "step": 878 }, { @@ -25477,27 +25477,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.189453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 709.39453125, - "completions/mean_terminated_length": 698.8543090820312, - "completions/min_length": 53.0, - "completions/min_terminated_length": 53.0, + "completions/max_terminated_length": 1871.0, + "completions/mean_length": 1073.06640625, + "completions/mean_terminated_length": 845.1903686523438, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.3000768114705129, - "grad_norm": 1.302786111831665, - "kl": 2.28515625, - "learning_rate": 8.9471999940354e-07, - "loss": 0.1255, - "num_tokens": 536660162.0, - "reward": 1.046875, - "reward_std": 0.2732976973056793, - "rewards/accuracy_reward/mean": 0.1088709682226181, - "rewards/accuracy_reward/std": 0.31179171800613403, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.15794700384140015, + "grad_norm": 7.395599842071533, + "kl": 7.3984375, + "learning_rate": 8.949879779305801e-07, + "loss": 0.5957, + "num_tokens": 592149363.0, + "reward": 0.9736328125, + "reward_std": 0.3537002503871918, + "rewards/accuracy_reward/mean": 0.12903225421905518, + "rewards/accuracy_reward/std": 0.33557409048080444, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8486328125, + "rewards/tag_count_reward/std": 0.2824586033821106, "step": 879 }, { @@ -25506,27 +25506,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.15234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1938.0, - "completions/mean_length": 777.19921875, - "completions/mean_terminated_length": 764.6666870117188, - "completions/min_length": 195.0, - "completions/min_terminated_length": 195.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1041.638671875, + "completions/mean_terminated_length": 860.7719116210938, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, "epoch": 0.3004181957839037, - "grad_norm": 3.1007723808288574, - "kl": 2.30859375, - "learning_rate": 8.943751512965437e-07, - "loss": 0.1278, - "num_tokens": 537154328.0, - "reward": 1.0029296875, - "reward_std": 0.23788556456565857, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, + "grad_norm": 6.8136115074157715, + "kl": 6.75, + "learning_rate": 8.946433796957683e-07, + "loss": 0.5432, + "num_tokens": 592778922.0, + "reward": 0.95166015625, + "reward_std": 0.30998027324676514, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.16440613567829132, + "rewards/tag_count_reward/mean": 0.88134765625, + "rewards/tag_count_reward/std": 0.24967442452907562, "step": 880 }, { @@ -25535,27 +25535,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.158203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1622.0, - "completions/mean_length": 667.107421875, - "completions/mean_terminated_length": 656.2342529296875, - "completions/min_length": 25.0, - "completions/min_terminated_length": 25.0, + "completions/max_terminated_length": 1728.0, + "completions/mean_length": 996.74609375, + "completions/mean_terminated_length": 799.1786499023438, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, "epoch": 0.30075958009729453, - "grad_norm": 3.776506185531616, - "kl": 1.515625, - "learning_rate": 8.940298144120074e-07, - "loss": 0.104, - "num_tokens": 537567663.0, - "reward": 1.07275390625, - "reward_std": 0.2482367753982544, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.95751953125, - "rewards/tag_count_reward/std": 0.13733947277069092, + "grad_norm": 6.437531471252441, + "kl": 7.4375, + "learning_rate": 8.942982919315083e-07, + "loss": 0.5882, + "num_tokens": 593361032.0, + "reward": 0.9853515625, + "reward_std": 0.33255141973495483, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8701171875, + "rewards/tag_count_reward/std": 0.26445597410202026, "step": 881 }, { @@ -25564,27 +25564,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.189453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1971.0, - "completions/mean_length": 761.78515625, - "completions/mean_terminated_length": 738.7713623046875, - "completions/min_length": 174.0, - "completions/min_terminated_length": 174.0, + "completions/max_terminated_length": 1702.0, + "completions/mean_length": 1078.05078125, + "completions/mean_terminated_length": 851.3397827148438, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.30110096441068535, - "grad_norm": 1.399491310119629, - "kl": 2.953125, - "learning_rate": 8.936839892400732e-07, - "loss": 0.1791, - "num_tokens": 538037409.0, - "reward": 1.07373046875, - "reward_std": 0.3079867362976074, - "rewards/accuracy_reward/mean": 0.12890625, - "rewards/accuracy_reward/std": 0.33542385697364807, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.16716907918453217, + "grad_norm": 11.650907516479492, + "kl": 8.78125, + "learning_rate": 8.939527151279606e-07, + "loss": 0.6106, + "num_tokens": 593992706.0, + "reward": 0.9814453125, + "reward_std": 0.3728793263435364, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8486328125, + "rewards/tag_count_reward/std": 0.2833233177661896, "step": 882 }, { @@ -25593,27 +25593,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.189453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1799.0, - "completions/mean_length": 818.599609375, - "completions/mean_terminated_length": 804.0217895507812, - "completions/min_length": 91.0, - "completions/min_terminated_length": 91.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1151.2421875, + "completions/mean_terminated_length": 941.6385498046875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, "epoch": 0.3014423487240761, - "grad_norm": 3.5203235149383545, - "kl": 2.138671875, - "learning_rate": 8.93337676271577e-07, - "loss": 0.1362, - "num_tokens": 538535348.0, - "reward": 1.0615234375, - "reward_std": 0.24921491742134094, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.9482421875, - "rewards/tag_count_reward/std": 0.14757278561592102, + "grad_norm": 7.056835174560547, + "kl": 8.3515625, + "learning_rate": 8.9360664977598e-07, + "loss": 0.5867, + "num_tokens": 594660958.0, + "reward": 0.9560546875, + "reward_std": 0.36406293511390686, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8408203125, + "rewards/tag_count_reward/std": 0.29017457365989685, "step": 883 }, { @@ -25622,27 +25622,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.142578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1845.0, - "completions/mean_length": 756.958984375, - "completions/mean_terminated_length": 744.226806640625, - "completions/min_length": 86.0, - "completions/min_terminated_length": 86.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1051.873046875, + "completions/mean_terminated_length": 886.2301025390625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.3017837330374669, - "grad_norm": 1.0830971002578735, - "kl": 2.2578125, - "learning_rate": 8.929908759980467e-07, - "loss": 0.1088, - "num_tokens": 539001279.0, - "reward": 1.01953125, - "reward_std": 0.25354111194610596, - "rewards/accuracy_reward/mean": 0.08266129344701767, - "rewards/accuracy_reward/std": 0.2756475806236267, + "grad_norm": 5.1051836013793945, + "kl": 6.8125, + "learning_rate": 8.932600963671164e-07, + "loss": 0.5304, + "num_tokens": 595277885.0, + "reward": 0.97705078125, + "reward_std": 0.32916760444641113, + "rewards/accuracy_reward/mean": 0.09879032522439957, + "rewards/accuracy_reward/std": 0.2986815273761749, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.16025325655937195, + "rewards/tag_count_reward/mean": 0.88134765625, + "rewards/tag_count_reward/std": 0.25211191177368164, "step": 884 }, { @@ -25651,27 +25651,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.177734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 740.443359375, - "completions/mean_terminated_length": 724.9387817382812, - "completions/min_length": 7.0, - "completions/min_terminated_length": 7.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1134.056640625, + "completions/mean_terminated_length": 936.5059814453125, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, "epoch": 0.30212511735085773, - "grad_norm": 2.659788131713867, - "kl": 3.5234375, - "learning_rate": 8.926435889117019e-07, - "loss": 0.1967, - "num_tokens": 539454610.0, - "reward": 1.0634765625, - "reward_std": 0.2688376307487488, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310528099536896, + "grad_norm": 5.304638862609863, + "kl": 8.671875, + "learning_rate": 8.92913055393612e-07, + "loss": 0.6128, + "num_tokens": 595932746.0, + "reward": 0.9892578125, + "reward_std": 0.3918631672859192, + "rewards/accuracy_reward/mean": 0.1391129046678543, + "rewards/accuracy_reward/std": 0.3464137017726898, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.16208821535110474, + "rewards/tag_count_reward/mean": 0.8544921875, + "rewards/tag_count_reward/std": 0.27770963311195374, "step": 885 }, { @@ -25680,27 +25680,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.001953125, + "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 695.8828125, - "completions/mean_terminated_length": 693.2367553710938, - "completions/min_length": 83.0, - "completions/min_terminated_length": 83.0, + "completions/max_terminated_length": 1625.0, + "completions/mean_length": 999.796875, + "completions/mean_terminated_length": 860.6548461914062, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, "epoch": 0.30246650166424854, - "grad_norm": 3.676887035369873, - "kl": 4.10546875, - "learning_rate": 8.922958155054527e-07, - "loss": 0.2062, - "num_tokens": 539886326.0, - "reward": 1.02001953125, - "reward_std": 0.26067638397216797, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.17183120548725128, + "grad_norm": 7.674163341522217, + "kl": 7.3515625, + "learning_rate": 8.925655273484015e-07, + "loss": 0.53, + "num_tokens": 596520066.0, + "reward": 1.0, + "reward_std": 0.3266477882862091, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.2355148047208786, "step": 886 }, { @@ -25709,27 +25709,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.1328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 699.330078125, - "completions/mean_terminated_length": 680.6356811523438, - "completions/min_length": 25.0, - "completions/min_terminated_length": 25.0, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 1067.90234375, + "completions/mean_terminated_length": 917.7973022460938, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.3028078859776393, - "grad_norm": 6.330347537994385, - "kl": 4.8984375, - "learning_rate": 8.919475562729004e-07, - "loss": 0.2743, - "num_tokens": 540331279.0, - "reward": 1.0302734375, - "reward_std": 0.28771382570266724, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.18753820657730103, + "grad_norm": 13.764772415161133, + "kl": 8.22265625, + "learning_rate": 8.922175127251119e-07, + "loss": 0.545, + "num_tokens": 597153728.0, + "reward": 1.02587890625, + "reward_std": 0.33099833130836487, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88134765625, + "rewards/tag_count_reward/std": 0.25548529624938965, "step": 887 }, { @@ -25738,27 +25738,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.169921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 756.12109375, - "completions/mean_terminated_length": 717.1307373046875, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 1907.0, + "completions/mean_length": 1131.927734375, + "completions/mean_terminated_length": 944.40234375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, "epoch": 0.3031492702910301, - "grad_norm": 5.038356304168701, - "kl": 3.91796875, - "learning_rate": 8.915988117083351e-07, - "loss": 0.2365, - "num_tokens": 540804093.0, - "reward": 0.9609375, - "reward_std": 0.21736985445022583, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.17832663655281067, + "grad_norm": 8.888741493225098, + "kl": 8.8359375, + "learning_rate": 8.918690120180612e-07, + "loss": 0.6003, + "num_tokens": 597818955.0, + "reward": 0.9228515625, + "reward_std": 0.3350474238395691, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8564453125, + "rewards/tag_count_reward/std": 0.2774066627025604, "step": 888 }, { @@ -25767,27 +25767,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.10546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 698.138671875, - "completions/mean_terminated_length": 668.5009765625, - "completions/min_length": 100.0, - "completions/min_terminated_length": 100.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 968.130859375, + "completions/mean_terminated_length": 840.81005859375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, "epoch": 0.30349065460442093, - "grad_norm": 2.2378509044647217, - "kl": 4.10546875, - "learning_rate": 8.912495823067356e-07, - "loss": 0.2691, - "num_tokens": 541235924.0, - "reward": 1.0390625, - "reward_std": 0.252128005027771, - "rewards/accuracy_reward/mean": 0.10685484111309052, - "rewards/accuracy_reward/std": 0.30924052000045776, + "grad_norm": 7.578829765319824, + "kl": 7.375, + "learning_rate": 8.915200257222579e-07, + "loss": 0.5343, + "num_tokens": 598389022.0, + "reward": 1.03515625, + "reward_std": 0.2941245138645172, + "rewards/accuracy_reward/mean": 0.13508065044879913, + "rewards/accuracy_reward/std": 0.34215477108955383, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.16843964159488678, + "rewards/tag_count_reward/mean": 0.904296875, + "rewards/tag_count_reward/std": 0.23328903317451477, "step": 889 }, { @@ -25796,27 +25796,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.087890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 722.943359375, - "completions/mean_terminated_length": 701.9107666015625, - "completions/min_length": 9.0, - "completions/min_terminated_length": 9.0, + "completions/max_terminated_length": 1958.0, + "completions/mean_length": 988.88671875, + "completions/mean_terminated_length": 886.830810546875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.30383203891781174, - "grad_norm": 4.78679895401001, - "kl": 3.390625, - "learning_rate": 8.908998685637696e-07, - "loss": 0.2461, - "num_tokens": 541677319.0, - "reward": 1.05322265625, - "reward_std": 0.2654269337654114, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.93994140625, - "rewards/tag_count_reward/std": 0.16531828045845032, + "grad_norm": 5.319472312927246, + "kl": 4.80859375, + "learning_rate": 8.911705543333998e-07, + "loss": 0.3631, + "num_tokens": 598966580.0, + "reward": 1.07568359375, + "reward_std": 0.2701775133609772, + "rewards/accuracy_reward/mean": 0.162109375, + "rewards/accuracy_reward/std": 0.3689115643501282, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91357421875, + "rewards/tag_count_reward/std": 0.22333602607250214, "step": 890 }, { @@ -25825,27 +25825,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.1328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1810.0, - "completions/mean_length": 801.087890625, - "completions/mean_terminated_length": 742.4396362304688, - "completions/min_length": 242.0, - "completions/min_terminated_length": 242.0, + "completions/max_terminated_length": 1916.0, + "completions/mean_length": 1076.03125, + "completions/mean_terminated_length": 927.1712036132812, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, "epoch": 0.3041734232312025, - "grad_norm": 7.234511375427246, - "kl": 4.8125, - "learning_rate": 8.905496709757917e-07, - "loss": 0.3774, - "num_tokens": 542164548.0, - "reward": 0.9912109375, - "reward_std": 0.27330106496810913, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, + "grad_norm": 7.011833667755127, + "kl": 7.03125, + "learning_rate": 8.908205983478742e-07, + "loss": 0.4924, + "num_tokens": 599594580.0, + "reward": 1.01904296875, + "reward_std": 0.3448503613471985, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.19395042955875397, + "rewards/tag_count_reward/mean": 0.89013671875, + "rewards/tag_count_reward/std": 0.25027456879615784, "step": 891 }, { @@ -25854,27 +25854,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.119140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1940.0, - "completions/mean_length": 770.48046875, - "completions/mean_terminated_length": 734.5662231445312, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1101.150390625, + "completions/mean_terminated_length": 973.084228515625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, "epoch": 0.3045148075445933, - "grad_norm": 3.368119955062866, - "kl": 4.32421875, - "learning_rate": 8.90198990039843e-07, - "loss": 0.2899, - "num_tokens": 542626170.0, - "reward": 0.96630859375, - "reward_std": 0.23860511183738708, - "rewards/accuracy_reward/mean": 0.052419353276491165, - "rewards/accuracy_reward/std": 0.22309619188308716, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.91162109375, - "rewards/tag_count_reward/std": 0.19377975165843964, + "grad_norm": 5.50002908706665, + "kl": 5.5390625, + "learning_rate": 8.904701582627566e-07, + "loss": 0.4239, + "num_tokens": 600225505.0, + "reward": 0.9755859375, + "reward_std": 0.25669339299201965, + "rewards/accuracy_reward/mean": 0.08467742055654526, + "rewards/accuracy_reward/std": 0.278682142496109, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8935546875, + "rewards/tag_count_reward/std": 0.2451036423444748, "step": 892 }, { @@ -25883,27 +25883,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.177734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1949.0, - "completions/mean_length": 731.869140625, - "completions/mean_terminated_length": 700.2820434570312, - "completions/min_length": 29.0, - "completions/min_terminated_length": 29.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1131.09375, + "completions/mean_terminated_length": 932.9026489257812, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.3048561918579841, - "grad_norm": 3.3479795455932617, - "kl": 3.55859375, - "learning_rate": 8.898478262536513e-07, - "loss": 0.1946, - "num_tokens": 543078279.0, - "reward": 1.009765625, - "reward_std": 0.2593262195587158, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.18677493929862976, + "grad_norm": 6.948239803314209, + "kl": 7.1484375, + "learning_rate": 8.901192345758098e-07, + "loss": 0.4985, + "num_tokens": 600882017.0, + "reward": 0.9365234375, + "reward_std": 0.3049319386482239, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8544921875, + "rewards/tag_count_reward/std": 0.28207945823669434, "step": 893 }, { @@ -25912,27 +25912,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 782.36328125, - "completions/mean_terminated_length": 757.1514282226562, - "completions/min_length": 62.0, - "completions/min_terminated_length": 62.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1132.056640625, + "completions/mean_terminated_length": 1010.4712524414062, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, "epoch": 0.30519757617137494, - "grad_norm": 2.381742477416992, - "kl": 3.73046875, - "learning_rate": 8.894961801156292e-07, - "loss": 0.1785, - "num_tokens": 543560577.0, - "reward": 1.02294921875, - "reward_std": 0.29584503173828125, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.18024997413158417, + "grad_norm": 6.720944404602051, + "kl": 4.55859375, + "learning_rate": 8.897678277854837e-07, + "loss": 0.354, + "num_tokens": 601543358.0, + "reward": 1.03125, + "reward_std": 0.33499640226364136, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.904296875, + "rewards/tag_count_reward/std": 0.23847422003746033, "step": 894 }, { @@ -25941,27 +25941,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.15234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1905.0, - "completions/mean_length": 769.951171875, - "completions/mean_terminated_length": 739.2780151367188, - "completions/min_length": 15.0, - "completions/min_terminated_length": 15.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 1094.490234375, + "completions/mean_terminated_length": 923.1221313476562, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.3055389604847657, - "grad_norm": 2.093144655227661, - "kl": 4.19921875, - "learning_rate": 8.891440521248742e-07, - "loss": 0.2567, - "num_tokens": 544023448.0, - "reward": 1.08447265625, - "reward_std": 0.2756364345550537, - "rewards/accuracy_reward/mean": 0.146484375, - "rewards/accuracy_reward/std": 0.35393697023391724, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.1704016625881195, + "grad_norm": 6.425414085388184, + "kl": 5.8515625, + "learning_rate": 8.894159383909151e-07, + "loss": 0.4299, + "num_tokens": 602172393.0, + "reward": 1.0458984375, + "reward_std": 0.3278510570526123, + "rewards/accuracy_reward/mean": 0.177734375, + "rewards/accuracy_reward/std": 0.3826628625392914, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8681640625, + "rewards/tag_count_reward/std": 0.2667154371738434, "step": 895 }, { @@ -25970,27 +25970,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 769.62109375, - "completions/mean_terminated_length": 738.9400634765625, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1080.91015625, + "completions/mean_terminated_length": 952.535400390625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, "epoch": 0.3058803447981565, - "grad_norm": 1.946558952331543, - "kl": 4.0078125, - "learning_rate": 8.887914427811676e-07, - "loss": 0.2431, - "num_tokens": 544490358.0, - "reward": 1.00439453125, - "reward_std": 0.24462217092514038, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.16818346083164215, + "grad_norm": 7.977920055389404, + "kl": 5.03125, + "learning_rate": 8.890635668919249e-07, + "loss": 0.404, + "num_tokens": 602798683.0, + "reward": 0.994140625, + "reward_std": 0.28986674547195435, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.900390625, + "rewards/tag_count_reward/std": 0.23164485394954681, "step": 896 }, { @@ -25999,27 +25999,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 781.759765625, - "completions/mean_terminated_length": 769.272216796875, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1134.1484375, + "completions/mean_terminated_length": 1021.9210815429688, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, "epoch": 0.3062217291115473, - "grad_norm": 1.506081461906433, - "kl": 2.919921875, - "learning_rate": 8.884383525849736e-07, - "loss": 0.1543, - "num_tokens": 544980043.0, - "reward": 1.029296875, - "reward_std": 0.2404770702123642, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.17043600976467133, + "grad_norm": 6.829026699066162, + "kl": 4.07421875, + "learning_rate": 8.887107137890202e-07, + "loss": 0.3332, + "num_tokens": 603468791.0, + "reward": 1.03076171875, + "reward_std": 0.2743030786514282, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89013671875, + "rewards/tag_count_reward/std": 0.2408103197813034, "step": 897 }, { @@ -26028,27 +26028,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.1015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1922.0, - "completions/mean_length": 779.76171875, - "completions/mean_terminated_length": 757.0695190429688, - "completions/min_length": 248.0, - "completions/min_terminated_length": 248.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1077.876953125, + "completions/mean_terminated_length": 968.2108154296875, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, "epoch": 0.30656311342493814, - "grad_norm": 1.7713485956192017, - "kl": 3.1015625, - "learning_rate": 8.880847820374395e-07, - "loss": 0.1911, - "num_tokens": 545451569.0, - "reward": 1.01708984375, - "reward_std": 0.22191838920116425, - "rewards/accuracy_reward/mean": 0.07500000298023224, - "rewards/accuracy_reward/std": 0.26366615295410156, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.15004098415374756, + "grad_norm": 5.950887680053711, + "kl": 5.1484375, + "learning_rate": 8.883573795833909e-07, + "loss": 0.4156, + "num_tokens": 604092952.0, + "reward": 1.00341796875, + "reward_std": 0.29253336787223816, + "rewards/accuracy_reward/mean": 0.1145833358168602, + "rewards/accuracy_reward/std": 0.3188507556915283, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89599609375, + "rewards/tag_count_reward/std": 0.24189116060733795, "step": 898 }, { @@ -26057,27 +26057,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.111328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1772.0, - "completions/mean_length": 726.482421875, - "completions/mean_terminated_length": 716.0767822265625, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1090.640625, + "completions/mean_terminated_length": 970.707763671875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.3069044977383289, - "grad_norm": 1.510651707649231, - "kl": 2.796875, - "learning_rate": 8.877307316403936e-07, - "loss": 0.1611, - "num_tokens": 545899336.0, - "reward": 0.99267578125, - "reward_std": 0.26471325755119324, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.17319931089878082, + "grad_norm": 2.589440107345581, + "kl": 5.0546875, + "learning_rate": 8.88003564776911e-07, + "loss": 0.3558, + "num_tokens": 604727168.0, + "reward": 0.962890625, + "reward_std": 0.273385226726532, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.892578125, + "rewards/tag_count_reward/std": 0.24317210912704468, "step": 899 }, { @@ -26086,27 +26086,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.00390625, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1952.0, - "completions/mean_length": 716.80078125, - "completions/mean_terminated_length": 711.5804443359375, - "completions/min_length": 18.0, - "completions/min_terminated_length": 18.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 965.03125, + "completions/mean_terminated_length": 890.4217529296875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, "epoch": 0.3072458820517197, - "grad_norm": 1.4322909116744995, - "kl": 3.0078125, - "learning_rate": 8.87376201896346e-07, - "loss": 0.205, - "num_tokens": 546336914.0, - "reward": 0.9833984375, - "reward_std": 0.19191157817840576, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.15593473613262177, + "grad_norm": 4.575494766235352, + "kl": 3.7578125, + "learning_rate": 8.876492698721374e-07, + "loss": 0.2992, + "num_tokens": 605291840.0, + "reward": 0.98388671875, + "reward_std": 0.23414167761802673, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92529296875, + "rewards/tag_count_reward/std": 0.20129208266735077, "step": 900 }, { @@ -26115,27 +26115,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.1015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 818.115234375, - "completions/mean_terminated_length": 793.6155395507812, - "completions/min_length": 195.0, - "completions/min_terminated_length": 195.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 1088.13671875, + "completions/mean_terminated_length": 979.63037109375, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, "epoch": 0.3075872663651105, - "grad_norm": 1.4152556657791138, - "kl": 4.125, - "learning_rate": 8.870211933084868e-07, - "loss": 0.2409, - "num_tokens": 546841805.0, - "reward": 0.947265625, - "reward_std": 0.20224609971046448, - "rewards/accuracy_reward/mean": 0.017578125, - "rewards/accuracy_reward/std": 0.13154059648513794, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.1806689351797104, + "grad_norm": 5.628513336181641, + "kl": 6.2265625, + "learning_rate": 8.872944953723079e-07, + "loss": 0.4872, + "num_tokens": 605934982.0, + "reward": 0.9248046875, + "reward_std": 0.2410810887813568, + "rewards/accuracy_reward/mean": 0.025390625, + "rewards/accuracy_reward/std": 0.15746226906776428, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8994140625, + "rewards/tag_count_reward/std": 0.234895259141922, "step": 901 }, { @@ -26144,27 +26144,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 699.349609375, - "completions/mean_terminated_length": 683.3577270507812, - "completions/min_length": 14.0, - "completions/min_terminated_length": 14.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 913.419921875, + "completions/mean_terminated_length": 830.1697998046875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.30792865067850134, - "grad_norm": 2.7752535343170166, - "kl": 3.45703125, - "learning_rate": 8.866657063806859e-07, - "loss": 0.2088, - "num_tokens": 547282480.0, - "reward": 1.00244140625, - "reward_std": 0.20316222310066223, - "rewards/accuracy_reward/mean": 0.06451612710952759, - "rewards/accuracy_reward/std": 0.2459181249141693, + "grad_norm": 4.82847261428833, + "kl": 5.296875, + "learning_rate": 8.869392417813427e-07, + "loss": 0.4128, + "num_tokens": 606485261.0, + "reward": 1.01953125, + "reward_std": 0.285092294216156, + "rewards/accuracy_reward/mean": 0.11088709533214569, + "rewards/accuracy_reward/std": 0.3143092691898346, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.93994140625, - "rewards/tag_count_reward/std": 0.15774647891521454, + "rewards/tag_count_reward/mean": 0.912109375, + "rewards/tag_count_reward/std": 0.21522264182567596, "step": 902 }, { @@ -26173,27 +26173,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1846.0, - "completions/mean_length": 693.265625, - "completions/mean_terminated_length": 677.2015991210938, - "completions/min_length": 82.0, - "completions/min_terminated_length": 82.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 901.25390625, + "completions/mean_terminated_length": 837.4144897460938, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.30827003499189215, - "grad_norm": 2.296919822692871, - "kl": 3.279296875, - "learning_rate": 8.863097416174916e-07, - "loss": 0.2034, - "num_tokens": 547717336.0, - "reward": 1.04296875, - "reward_std": 0.23253028094768524, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, + "grad_norm": 3.645758867263794, + "kl": 4.8515625, + "learning_rate": 8.865835096038413e-07, + "loss": 0.3805, + "num_tokens": 607026607.0, + "reward": 1.064453125, + "reward_std": 0.2538946270942688, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.94140625, - "rewards/tag_count_reward/std": 0.16021747887134552, + "rewards/tag_count_reward/mean": 0.939453125, + "rewards/tag_count_reward/std": 0.17900052666664124, "step": 903 }, { @@ -26202,27 +26202,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 749.271484375, - "completions/mean_terminated_length": 739.0452880859375, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 1933.0, + "completions/mean_length": 972.806640625, + "completions/mean_terminated_length": 861.5797119140625, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, "epoch": 0.3086114193052829, - "grad_norm": 1.4624488353729248, - "kl": 3.84375, - "learning_rate": 8.859532995241309e-07, - "loss": 0.1991, - "num_tokens": 548182931.0, - "reward": 1.00341796875, - "reward_std": 0.2580622732639313, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.17351900041103363, + "grad_norm": 134.0551300048828, + "kl": 9.5390625, + "learning_rate": 8.862272993450842e-07, + "loss": 0.5898, + "num_tokens": 607606652.0, + "reward": 1.04541015625, + "reward_std": 0.29250040650367737, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91845703125, + "rewards/tag_count_reward/std": 0.20940732955932617, "step": 904 }, { @@ -26231,27 +26231,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.00390625, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1840.0, - "completions/mean_length": 712.423828125, - "completions/mean_terminated_length": 707.1863403320312, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 893.693359375, + "completions/mean_terminated_length": 851.6336059570312, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, "epoch": 0.3089528036186737, - "grad_norm": 2.0424282550811768, - "kl": 3.1484375, - "learning_rate": 8.855963806065085e-07, - "loss": 0.1681, - "num_tokens": 548622588.0, - "reward": 1.044921875, - "reward_std": 0.23043325543403625, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, + "grad_norm": 2.665830135345459, + "kl": 3.2421875, + "learning_rate": 8.858706115110301e-07, + "loss": 0.2812, + "num_tokens": 608139119.0, + "reward": 1.10595703125, + "reward_std": 0.24411088228225708, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9375, - "rewards/tag_count_reward/std": 0.16102655231952667, + "rewards/tag_count_reward/mean": 0.95361328125, + "rewards/tag_count_reward/std": 0.1561574786901474, "step": 905 }, { @@ -26260,27 +26260,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1875.0, - "completions/mean_length": 734.583984375, - "completions/mean_terminated_length": 697.66064453125, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 1954.0, + "completions/mean_length": 926.408203125, + "completions/mean_terminated_length": 846.6296997070312, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, "epoch": 0.30929418793206453, - "grad_norm": 1.9697262048721313, - "kl": 4.66796875, - "learning_rate": 8.85238985371205e-07, - "loss": 0.295, - "num_tokens": 549079399.0, - "reward": 0.978515625, - "reward_std": 0.23790127038955688, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, + "grad_norm": 5.131568431854248, + "kl": 5.7890625, + "learning_rate": 8.855134466083165e-07, + "loss": 0.4394, + "num_tokens": 608694144.0, + "reward": 1.03271484375, + "reward_std": 0.24678128957748413, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.18685677647590637, + "rewards/tag_count_reward/mean": 0.91943359375, + "rewards/tag_count_reward/std": 0.19716253876686096, "step": 906 }, { @@ -26291,25 +26291,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1904.0, - "completions/mean_length": 788.373046875, - "completions/mean_terminated_length": 758.1420288085938, - "completions/min_length": 138.0, - "completions/min_terminated_length": 138.0, + "completions/max_terminated_length": 1741.0, + "completions/mean_length": 893.859375, + "completions/mean_terminated_length": 866.1600341796875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, "epoch": 0.30963557224545535, - "grad_norm": 2.7737860679626465, - "kl": 5.25, - "learning_rate": 8.848811143254779e-07, - "loss": 0.3418, - "num_tokens": 549560838.0, - "reward": 0.9716796875, - "reward_std": 0.23982642590999603, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, + "grad_norm": 2.074618339538574, + "kl": 2.1240234375, + "learning_rate": 8.851558051442581e-07, + "loss": 0.183, + "num_tokens": 609229592.0, + "reward": 1.08544921875, + "reward_std": 0.21182072162628174, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.19645671546459198, + "rewards/tag_count_reward/mean": 0.96826171875, + "rewards/tag_count_reward/std": 0.12932726740837097, "step": 907 }, { @@ -26318,27 +26318,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 702.029296875, - "completions/mean_terminated_length": 677.9462890625, - "completions/min_length": 95.0, - "completions/min_terminated_length": 95.0, + "completions/max_terminated_length": 1958.0, + "completions/mean_length": 816.53515625, + "completions/mean_terminated_length": 771.6640014648438, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.3099769565588461, - "grad_norm": 5.901828289031982, - "kl": 4.921875, - "learning_rate": 8.845227679772596e-07, - "loss": 0.303, - "num_tokens": 549997317.0, - "reward": 1.03466796875, - "reward_std": 0.3133937120437622, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.20397058129310608, + "grad_norm": 3.461775541305542, + "kl": 3.1337890625, + "learning_rate": 8.847976876268467e-07, + "loss": 0.2677, + "num_tokens": 609724698.0, + "reward": 1.130859375, + "reward_std": 0.2693468928337097, + "rewards/accuracy_reward/mean": 0.181640625, + "rewards/accuracy_reward/std": 0.38592514395713806, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.1613585203886032, "step": 908 }, { @@ -26347,27 +26347,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 690.173828125, - "completions/mean_terminated_length": 674.0731201171875, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 1616.0, + "completions/mean_length": 838.40234375, + "completions/mean_terminated_length": 799.383056640625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, "epoch": 0.3103183408722369, - "grad_norm": 2.511085271835327, - "kl": 3.02734375, - "learning_rate": 8.841639468351571e-07, - "loss": 0.1876, - "num_tokens": 550424270.0, - "reward": 1.0078125, - "reward_std": 0.252117395401001, - "rewards/accuracy_reward/mean": 0.06653226166963577, - "rewards/accuracy_reward/std": 0.2494617998600006, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.9375, - "rewards/tag_count_reward/std": 0.1540389060974121, + "grad_norm": 3.523098945617676, + "kl": 3.28515625, + "learning_rate": 8.844390945647507e-07, + "loss": 0.2595, + "num_tokens": 610227544.0, + "reward": 1.07373046875, + "reward_std": 0.2635180950164795, + "rewards/accuracy_reward/mean": 0.11895161122083664, + "rewards/accuracy_reward/std": 0.3240584135055542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95849609375, + "rewards/tag_count_reward/std": 0.15199612081050873, "step": 909 }, { @@ -26376,27 +26376,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 744.2109375, - "completions/mean_terminated_length": 718.2390747070312, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 1838.0, + "completions/mean_length": 872.697265625, + "completions/mean_terminated_length": 822.4297485351562, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.31065972518562773, - "grad_norm": 3.7137417793273926, - "kl": 3.53125, - "learning_rate": 8.838046514084516e-07, - "loss": 0.2435, - "num_tokens": 550879002.0, - "reward": 1.0166015625, - "reward_std": 0.2949827313423157, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.9111328125, - "rewards/tag_count_reward/std": 0.20069128274917603, + "grad_norm": 4.324607849121094, + "kl": 3.888671875, + "learning_rate": 8.840800264673133e-07, + "loss": 0.3143, + "num_tokens": 610748061.0, + "reward": 1.0703125, + "reward_std": 0.2996329665184021, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.17694957554340363, "step": 910 }, { @@ -26405,27 +26405,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 751.1796875, - "completions/mean_terminated_length": 725.3466186523438, - "completions/min_length": 78.0, - "completions/min_terminated_length": 78.0, + "completions/max_terminated_length": 1868.0, + "completions/mean_length": 851.470703125, + "completions/mean_terminated_length": 807.8724975585938, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.31100110949901855, - "grad_norm": 7.024240970611572, - "kl": 2.826171875, - "learning_rate": 8.834448822070971e-07, - "loss": 0.2561, - "num_tokens": 551344150.0, - "reward": 0.96875, - "reward_std": 0.2577192187309265, + "grad_norm": 26.543291091918945, + "kl": 3.73828125, + "learning_rate": 8.837204838445528e-07, + "loss": 0.2907, + "num_tokens": 611264558.0, + "reward": 0.97900390625, + "reward_std": 0.21171167492866516, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.1921909898519516, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.17468802630901337, "step": 911 }, { @@ -26434,27 +26434,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 740.40625, - "completions/mean_terminated_length": 690.0121459960938, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 1715.0, + "completions/mean_length": 781.34765625, + "completions/mean_terminated_length": 745.7389526367188, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.3113424938124093, - "grad_norm": 7.939428329467773, - "kl": 3.85546875, - "learning_rate": 8.830846397417202e-07, - "loss": 0.3628, - "num_tokens": 551798710.0, - "reward": 1.0, - "reward_std": 0.24333296716213226, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, + "grad_norm": 4.281364440917969, + "kl": 3.30859375, + "learning_rate": 8.833604672071616e-07, + "loss": 0.2657, + "num_tokens": 611740080.0, + "reward": 1.046875, + "reward_std": 0.1814068853855133, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.19499453902244568, + "rewards/tag_count_reward/mean": 0.943359375, + "rewards/tag_count_reward/std": 0.16467617452144623, "step": 912 }, { @@ -26463,27 +26463,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 709.05859375, - "completions/mean_terminated_length": 690.4990234375, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/max_terminated_length": 1834.0, + "completions/mean_length": 763.65234375, + "completions/mean_terminated_length": 738.0677490234375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, "epoch": 0.3116838781258001, - "grad_norm": 2.98781681060791, - "kl": 3.34765625, - "learning_rate": 8.827239245236194e-07, - "loss": 0.2729, - "num_tokens": 552234644.0, - "reward": 1.041015625, - "reward_std": 0.21898271143436432, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.9453125, - "rewards/tag_count_reward/std": 0.15143637359142303, + "grad_norm": 2.2851076126098633, + "kl": 2.1484375, + "learning_rate": 8.829999770665051e-07, + "loss": 0.1891, + "num_tokens": 612203966.0, + "reward": 1.06298828125, + "reward_std": 0.21350131928920746, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95556640625, + "rewards/tag_count_reward/std": 0.1527736783027649, "step": 913 }, { @@ -26492,27 +26492,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 731.732421875, - "completions/mean_terminated_length": 710.83935546875, - "completions/min_length": 71.0, - "completions/min_terminated_length": 71.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 822.48828125, + "completions/mean_terminated_length": 770.0733642578125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.31202526243919093, - "grad_norm": 1.9520854949951172, - "kl": 4.39453125, - "learning_rate": 8.823627370647634e-07, - "loss": 0.2713, - "num_tokens": 552692043.0, - "reward": 0.9521484375, - "reward_std": 0.20709767937660217, - "rewards/accuracy_reward/mean": 0.01953125, - "rewards/accuracy_reward/std": 0.1385180652141571, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.9287109375, - "rewards/tag_count_reward/std": 0.18117332458496094, + "grad_norm": 2.825894594192505, + "kl": 4.7421875, + "learning_rate": 8.826390139346213e-07, + "loss": 0.3893, + "num_tokens": 612707832.0, + "reward": 0.9775390625, + "reward_std": 0.2177228182554245, + "rewards/accuracy_reward/mean": 0.044921875, + "rewards/accuracy_reward/std": 0.20733514428138733, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9326171875, + "rewards/tag_count_reward/std": 0.185324028134346, "step": 914 }, { @@ -26521,27 +26521,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, - "completions/mean_length": 748.013671875, - "completions/mean_terminated_length": 724.75341796875, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/mean_length": 822.037109375, + "completions/mean_terminated_length": 807.5000610351562, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, "epoch": 0.31236664675258174, - "grad_norm": 4.948241233825684, - "kl": 4.57421875, - "learning_rate": 8.820010778777925e-07, - "loss": 0.2511, - "num_tokens": 553156594.0, - "reward": 0.98681640625, - "reward_std": 0.23312708735466003, + "grad_norm": 2.263535737991333, + "kl": 2.84765625, + "learning_rate": 8.822775783242204e-07, + "loss": 0.2351, + "num_tokens": 613210283.0, + "reward": 0.99267578125, + "reward_std": 0.2031371295452118, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.1792825609445572, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.17468802630901337, "step": 915 }, { @@ -26550,27 +26550,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1781.0, - "completions/mean_length": 742.59375, - "completions/mean_terminated_length": 719.236572265625, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 866.1015625, + "completions/mean_terminated_length": 827.9757690429688, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, "epoch": 0.3127080310659725, - "grad_norm": 5.643583297729492, - "kl": 5.015625, - "learning_rate": 8.816389474760151e-07, - "loss": 0.2985, - "num_tokens": 553628130.0, - "reward": 1.0224609375, - "reward_std": 0.2850502133369446, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.19194971024990082, + "grad_norm": 2.470695972442627, + "kl": 2.732421875, + "learning_rate": 8.819156707486831e-07, + "loss": 0.2505, + "num_tokens": 613745055.0, + "reward": 1.02294921875, + "reward_std": 0.2798839211463928, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.1930980682373047, "step": 916 }, { @@ -26579,27 +26579,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1971.0, - "completions/mean_length": 721.310546875, - "completions/mean_terminated_length": 705.5791015625, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 1954.0, + "completions/mean_length": 814.6328125, + "completions/mean_terminated_length": 790.0637817382812, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.3130494153793633, - "grad_norm": 3.775449514389038, - "kl": 4.61328125, - "learning_rate": 8.812763463734095e-07, - "loss": 0.2438, - "num_tokens": 554074641.0, - "reward": 0.990234375, - "reward_std": 0.244065523147583, - "rewards/accuracy_reward/mean": 0.06451612710952759, - "rewards/accuracy_reward/std": 0.2459181249141693, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.18652920424938202, + "grad_norm": 5.178702354431152, + "kl": 3.11328125, + "learning_rate": 8.81553291722061e-07, + "loss": 0.2328, + "num_tokens": 614239347.0, + "reward": 1.0263671875, + "reward_std": 0.2602751553058624, + "rewards/accuracy_reward/mean": 0.0947580635547638, + "rewards/accuracy_reward/std": 0.29317617416381836, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.17864517867565155, "step": 917 }, { @@ -26608,27 +26608,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.005859375, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 746.1796875, - "completions/mean_terminated_length": 738.5068969726562, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 1734.0, + "completions/mean_length": 807.880859375, + "completions/mean_terminated_length": 788.1964721679688, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.31339079969275413, - "grad_norm": 3.482968330383301, - "kl": 4.94140625, - "learning_rate": 8.809132750846214e-07, - "loss": 0.2725, - "num_tokens": 554530477.0, - "reward": 1.064453125, - "reward_std": 0.3038240075111389, + "grad_norm": 8.297091484069824, + "kl": 1.7978515625, + "learning_rate": 8.811904417590752e-07, + "loss": 0.1643, + "num_tokens": 614726774.0, + "reward": 1.080078125, + "reward_std": 0.2699255347251892, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.931640625, - "rewards/tag_count_reward/std": 0.1726529598236084, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.951171875, + "rewards/tag_count_reward/std": 0.15658599138259888, "step": 918 }, { @@ -26637,27 +26637,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1877.0, - "completions/mean_length": 713.826171875, - "completions/mean_terminated_length": 700.6686401367188, - "completions/min_length": 76.0, - "completions/min_terminated_length": 76.0, + "completions/max_terminated_length": 1902.0, + "completions/mean_length": 783.92578125, + "completions/mean_terminated_length": 763.8611450195312, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.31373218400614494, - "grad_norm": 1.567186713218689, - "kl": 4.05078125, - "learning_rate": 8.805497341249642e-07, - "loss": 0.2236, - "num_tokens": 554972932.0, - "reward": 1.0068359375, - "reward_std": 0.24477024376392365, - "rewards/accuracy_reward/mean": 0.06854838877916336, - "rewards/accuracy_reward/std": 0.25293970108032227, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.9345703125, - "rewards/tag_count_reward/std": 0.17378656566143036, + "grad_norm": 4.479769706726074, + "kl": 2.4287109375, + "learning_rate": 8.808271213751157e-07, + "loss": 0.2414, + "num_tokens": 615205120.0, + "reward": 1.0107421875, + "reward_std": 0.21461519598960876, + "rewards/accuracy_reward/mean": 0.08467742055654526, + "rewards/accuracy_reward/std": 0.278682142496109, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.19670946896076202, "step": 919 }, { @@ -26666,27 +26666,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1838.0, - "completions/mean_length": 830.228515625, - "completions/mean_terminated_length": 795.9939575195312, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/max_terminated_length": 1938.0, + "completions/mean_length": 849.8203125, + "completions/mean_terminated_length": 823.5130004882812, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, "epoch": 0.3140735683195357, - "grad_norm": 1.707174301147461, - "kl": 4.45703125, - "learning_rate": 8.801857240104179e-07, - "loss": 0.2765, - "num_tokens": 555473417.0, - "reward": 0.9892578125, - "reward_std": 0.26355940103530884, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.24230584502220154, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.18612663447856903, + "grad_norm": 6.861300945281982, + "kl": 3.349609375, + "learning_rate": 8.804633310862404e-07, + "loss": 0.334, + "num_tokens": 615715636.0, + "reward": 0.99072265625, + "reward_std": 0.28117692470550537, + "rewards/accuracy_reward/mean": 0.08870967477560043, + "rewards/accuracy_reward/std": 0.284611314535141, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90478515625, + "rewards/tag_count_reward/std": 0.21579405665397644, "step": 920 }, { @@ -26695,27 +26695,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1885.0, - "completions/mean_length": 718.58203125, - "completions/mean_terminated_length": 705.471435546875, - "completions/min_length": 101.0, - "completions/min_terminated_length": 101.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 794.62890625, + "completions/mean_terminated_length": 769.661376953125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.3144149526329265, - "grad_norm": 1.327146053314209, - "kl": 3.484375, - "learning_rate": 8.798212452576282e-07, - "loss": 0.2167, - "num_tokens": 555922771.0, - "reward": 1.099609375, - "reward_std": 0.27388161420822144, - "rewards/accuracy_reward/mean": 0.15234375, - "rewards/accuracy_reward/std": 0.35970520973205566, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.16327762603759766, + "grad_norm": 7.27172327041626, + "kl": 2.513671875, + "learning_rate": 8.80099071409175e-07, + "loss": 0.2674, + "num_tokens": 616203926.0, + "reward": 1.11669921875, + "reward_std": 0.2874504327774048, + "rewards/accuracy_reward/mean": 0.185546875, + "rewards/accuracy_reward/std": 0.38912075757980347, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.19288021326065063, "step": 921 }, { @@ -26724,27 +26724,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 830.103515625, - "completions/mean_terminated_length": 800.8740234375, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 865.658203125, + "completions/mean_terminated_length": 858.6896362304688, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.3147563369463173, - "grad_norm": 2.0233733654022217, - "kl": 3.1015625, - "learning_rate": 8.794562983839058e-07, - "loss": 0.2271, - "num_tokens": 556432488.0, - "reward": 1.06103515625, - "reward_std": 0.2801864445209503, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.93798828125, - "rewards/tag_count_reward/std": 0.16384941339492798, + "grad_norm": 3.9548397064208984, + "kl": 2.27734375, + "learning_rate": 8.797343428613121e-07, + "loss": 0.223, + "num_tokens": 616731847.0, + "reward": 1.02587890625, + "reward_std": 0.2950492799282074, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90087890625, + "rewards/tag_count_reward/std": 0.217988058924675, "step": 922 }, { @@ -26753,27 +26753,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 749.255859375, - "completions/mean_terminated_length": 723.3844604492188, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 819.73828125, + "completions/mean_terminated_length": 787.739501953125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, "epoch": 0.31509772125970814, - "grad_norm": 5.790834903717041, - "kl": 2.75390625, - "learning_rate": 8.790908839072262e-07, - "loss": 0.2092, - "num_tokens": 556894331.0, - "reward": 0.984375, - "reward_std": 0.20769117772579193, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.16913031041622162, + "grad_norm": 141.9339599609375, + "kl": 4.828125, + "learning_rate": 8.793691459607097e-07, + "loss": 0.3184, + "num_tokens": 617229777.0, + "reward": 0.98486328125, + "reward_std": 0.2511994540691376, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91064453125, + "rewards/tag_count_reward/std": 0.21145978569984436, "step": 923 }, { @@ -26782,27 +26782,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 790.607421875, - "completions/mean_terminated_length": 752.6578979492188, - "completions/min_length": 18.0, - "completions/min_terminated_length": 18.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 823.48046875, + "completions/mean_terminated_length": 796.5947875976562, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, "epoch": 0.3154391055730989, - "grad_norm": 4.157425880432129, - "kl": 3.03515625, - "learning_rate": 8.787250023462286e-07, - "loss": 0.2037, - "num_tokens": 557371538.0, - "reward": 0.95654296875, - "reward_std": 0.19840088486671448, - "rewards/accuracy_reward/mean": 0.025390625, - "rewards/accuracy_reward/std": 0.15746226906776428, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.17876482009887695, + "grad_norm": 3.438264846801758, + "kl": 2.80078125, + "learning_rate": 8.790034812260915e-07, + "loss": 0.2291, + "num_tokens": 617723815.0, + "reward": 0.94091796875, + "reward_std": 0.22740837931632996, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.2029850035905838, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89794921875, + "rewards/tag_count_reward/std": 0.22548207640647888, "step": 924 }, { @@ -26811,56 +26811,56 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 772.291015625, - "completions/mean_terminated_length": 749.4651489257812, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 864.822265625, + "completions/mean_terminated_length": 831.5601806640625, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, "epoch": 0.3157804898864897, - "grad_norm": 1.3573524951934814, - "kl": 2.732421875, - "learning_rate": 8.783586542202148e-07, - "loss": 0.1264, - "num_tokens": 557839879.0, - "reward": 0.953125, - "reward_std": 0.24137933552265167, - "rewards/accuracy_reward/mean": 0.03427419438958168, - "rewards/accuracy_reward/std": 0.18211629986763, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.19289569556713104, - "step": 925 - }, - { - "clip_ratio/high_max": 0.0, - "clip_ratio/high_mean": 0.0, + "grad_norm": 5.8524274826049805, + "kl": 3.36328125, + "learning_rate": 8.786373491768456e-07, + "loss": 0.3191, + "num_tokens": 618239532.0, + "reward": 0.92138671875, + "reward_std": 0.2730148136615753, + "rewards/accuracy_reward/mean": 0.0463709682226181, + "rewards/accuracy_reward/std": 0.21049949526786804, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87646484375, + "rewards/tag_count_reward/std": 0.24330125749111176, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1916.0, - "completions/mean_length": 765.982421875, - "completions/mean_terminated_length": 743.043701171875, + "completions/max_terminated_length": 1933.0, + "completions/mean_length": 813.607421875, + "completions/mean_terminated_length": 791.5208129882812, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.3161218741998805, - "grad_norm": 1.8128553628921509, - "kl": 3.16015625, - "learning_rate": 8.779918400491488e-07, - "loss": 0.1972, - "num_tokens": 558313566.0, - "reward": 1.04443359375, - "reward_std": 0.29815906286239624, - "rewards/accuracy_reward/mean": 0.12109375, - "rewards/accuracy_reward/std": 0.3265552520751953, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.1865541785955429, + "grad_norm": 7.005990982055664, + "kl": 2.978515625, + "learning_rate": 8.782707503330235e-07, + "loss": 0.2882, + "num_tokens": 618737603.0, + "reward": 1.03125, + "reward_std": 0.27408674359321594, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.21501831710338593, "step": 926 }, { @@ -26869,27 +26869,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 746.015625, - "completions/mean_terminated_length": 722.7196655273438, - "completions/min_length": 237.0, - "completions/min_terminated_length": 237.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 853.650390625, + "completions/mean_terminated_length": 824.9860229492188, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, "epoch": 0.31646325851327134, - "grad_norm": 2.1845901012420654, - "kl": 3.37890625, - "learning_rate": 8.776245603536565e-07, - "loss": 0.2325, - "num_tokens": 558781126.0, - "reward": 0.99365234375, - "reward_std": 0.2560039162635803, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.17876482009887695, + "grad_norm": 8.662222862243652, + "kl": 3.87890625, + "learning_rate": 8.779036852153406e-07, + "loss": 0.383, + "num_tokens": 619260272.0, + "reward": 0.97021484375, + "reward_std": 0.3179694414138794, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87255859375, + "rewards/tag_count_reward/std": 0.2397485226392746, "step": 927 }, { @@ -26898,27 +26898,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1978.0, - "completions/mean_length": 767.728515625, - "completions/mean_terminated_length": 734.374755859375, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 841.900390625, + "completions/mean_terminated_length": 800.4788208007812, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.3168046428266621, - "grad_norm": 3.265126943588257, - "kl": 3.8828125, - "learning_rate": 8.772568156550241e-07, - "loss": 0.2827, - "num_tokens": 559256299.0, - "reward": 1.0185546875, - "reward_std": 0.2517598271369934, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.9345703125, - "rewards/tag_count_reward/std": 0.16878816485404968, + "grad_norm": 3.5595650672912598, + "kl": 5.01953125, + "learning_rate": 8.775361543451735e-07, + "loss": 0.4295, + "num_tokens": 619773421.0, + "reward": 0.9375, + "reward_std": 0.307569682598114, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.24766522645950317, "step": 928 }, { @@ -26927,27 +26927,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 1940.0, - "completions/max_terminated_length": 1940.0, - "completions/mean_length": 716.318359375, - "completions/mean_terminated_length": 716.318359375, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 870.712890625, + "completions/mean_terminated_length": 825.3407592773438, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, "epoch": 0.3171460271400529, - "grad_norm": 1.339988112449646, - "kl": 3.326171875, - "learning_rate": 8.76888606475198e-07, - "loss": 0.1828, - "num_tokens": 559695934.0, - "reward": 0.9970703125, - "reward_std": 0.2440335899591446, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.16588735580444336, + "grad_norm": 5.622350215911865, + "kl": 5.53125, + "learning_rate": 8.771681582445612e-07, + "loss": 0.4481, + "num_tokens": 620292106.0, + "reward": 0.9384765625, + "reward_std": 0.31713542342185974, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8603515625, + "rewards/tag_count_reward/std": 0.2512790560722351, "step": 929 }, { @@ -26956,27 +26956,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 748.62109375, - "completions/mean_terminated_length": 717.43603515625, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 874.041015625, + "completions/mean_terminated_length": 808.6866455078125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, "epoch": 0.3174874114534437, - "grad_norm": 6.6354146003723145, - "kl": 6.7890625, - "learning_rate": 8.765199333367837e-07, - "loss": 0.3981, - "num_tokens": 560155068.0, - "reward": 0.97216796875, - "reward_std": 0.25777316093444824, - "rewards/accuracy_reward/mean": 0.052419353276491165, - "rewards/accuracy_reward/std": 0.22309619188308716, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.19715769588947296, + "grad_norm": 6.712055683135986, + "kl": 6.609375, + "learning_rate": 8.767996974362034e-07, + "loss": 0.5272, + "num_tokens": 620815455.0, + "reward": 0.86083984375, + "reward_std": 0.271533727645874, + "rewards/accuracy_reward/mean": 0.024193547666072845, + "rewards/accuracy_reward/std": 0.15380479395389557, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.83740234375, + "rewards/tag_count_reward/std": 0.27144336700439453, "step": 930 }, { @@ -26985,27 +26985,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1979.0, - "completions/mean_length": 694.666015625, - "completions/mean_terminated_length": 659.4088134765625, - "completions/min_length": 62.0, - "completions/min_terminated_length": 62.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 820.7421875, + "completions/mean_terminated_length": 755.08642578125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, "epoch": 0.31782879576683454, - "grad_norm": 8.99532699584961, - "kl": 8.1484375, - "learning_rate": 8.761507967630453e-07, - "loss": 0.4642, - "num_tokens": 560595809.0, - "reward": 1.0048828125, - "reward_std": 0.27473604679107666, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.9091796875, - "rewards/tag_count_reward/std": 0.19796842336654663, + "grad_norm": 6.713479518890381, + "kl": 6.515625, + "learning_rate": 8.764307724434592e-07, + "loss": 0.5543, + "num_tokens": 621320747.0, + "reward": 0.95849609375, + "reward_std": 0.32186347246170044, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86083984375, + "rewards/tag_count_reward/std": 0.2599199712276459, "step": 931 }, { @@ -27014,27 +27014,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 732.279296875, - "completions/mean_terminated_length": 706.0697631835938, - "completions/min_length": 67.0, - "completions/min_terminated_length": 67.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 869.896484375, + "completions/mean_terminated_length": 811.9569091796875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.3181701800802253, - "grad_norm": 8.550848007202148, - "kl": 7.4921875, - "learning_rate": 8.757811972779048e-07, - "loss": 0.4005, - "num_tokens": 561045760.0, - "reward": 0.998046875, - "reward_std": 0.2719319462776184, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.18775463104248047, + "grad_norm": 5.737715721130371, + "kl": 5.890625, + "learning_rate": 8.76061383790348e-07, + "loss": 0.4572, + "num_tokens": 621841158.0, + "reward": 0.93359375, + "reward_std": 0.33846548199653625, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.830078125, + "rewards/tag_count_reward/std": 0.26713964343070984, "step": 932 }, { @@ -27043,27 +27043,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1938.0, - "completions/mean_length": 676.158203125, - "completions/mean_terminated_length": 659.891357421875, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 830.978515625, + "completions/mean_terminated_length": 763.226806640625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, "epoch": 0.3185115643936161, - "grad_norm": 8.880213737487793, - "kl": 6.3828125, - "learning_rate": 8.754111354059409e-07, - "loss": 0.3314, - "num_tokens": 561467201.0, - "reward": 0.99755859375, - "reward_std": 0.267604798078537, + "grad_norm": 5.424788951873779, + "kl": 5.7578125, + "learning_rate": 8.75691532001547e-07, + "loss": 0.4405, + "num_tokens": 622341867.0, + "reward": 0.90283203125, + "reward_std": 0.2899892330169678, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.17655426263809204, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.83642578125, + "rewards/tag_count_reward/std": 0.26949673891067505, "step": 933 }, { @@ -27072,27 +27072,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1937.0, - "completions/mean_length": 824.904296875, - "completions/mean_terminated_length": 790.5200805664062, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 897.818359375, + "completions/mean_terminated_length": 851.06298828125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, "epoch": 0.3188529487070069, - "grad_norm": 7.950473785400391, - "kl": 6.78125, - "learning_rate": 8.750406116723889e-07, - "loss": 0.3817, - "num_tokens": 561966576.0, - "reward": 0.9892578125, - "reward_std": 0.2762683928012848, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, - "rewards/tag_count_reward/mean": 0.9033203125, - "rewards/tag_count_reward/std": 0.2007293701171875, + "grad_norm": 3.639519691467285, + "kl": 4.90234375, + "learning_rate": 8.753212176023914e-07, + "loss": 0.3879, + "num_tokens": 622878574.0, + "reward": 0.94580078125, + "reward_std": 0.307527631521225, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.84814453125, + "rewards/tag_count_reward/std": 0.2522141933441162, "step": 934 }, { @@ -27101,27 +27101,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 734.10546875, - "completions/mean_terminated_length": 705.2575073242188, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 1870.0, + "completions/mean_length": 842.275390625, + "completions/mean_terminated_length": 772.522705078125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, "epoch": 0.31919433302039774, - "grad_norm": 1.5691170692443848, - "kl": 4.28125, - "learning_rate": 8.746696266031392e-07, - "loss": 0.3072, - "num_tokens": 562428502.0, - "reward": 1.076171875, - "reward_std": 0.26259198784828186, - "rewards/accuracy_reward/mean": 0.12890625, - "rewards/accuracy_reward/std": 0.33542385697364807, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.15785017609596252, + "grad_norm": 3.956763982772827, + "kl": 4.8984375, + "learning_rate": 8.74950441118874e-07, + "loss": 0.3829, + "num_tokens": 623395883.0, + "reward": 0.99072265625, + "reward_std": 0.3071928322315216, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87548828125, + "rewards/tag_count_reward/std": 0.2287970632314682, "step": 935 }, { @@ -27130,27 +27130,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.087890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 815.935546875, - "completions/mean_terminated_length": 783.8377075195312, - "completions/min_length": 101.0, - "completions/min_terminated_length": 101.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 962.904296875, + "completions/mean_terminated_length": 858.3447265625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.3195357173337885, - "grad_norm": 4.007493495941162, - "kl": 4.50390625, - "learning_rate": 8.742981807247374e-07, - "loss": 0.3032, - "num_tokens": 562928613.0, - "reward": 0.9599609375, - "reward_std": 0.27960240840911865, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.9052734375, - "rewards/tag_count_reward/std": 0.214018315076828, + "grad_norm": 4.665554523468018, + "kl": 5.5234375, + "learning_rate": 8.745792030776433e-07, + "loss": 0.4261, + "num_tokens": 623971242.0, + "reward": 0.85986328125, + "reward_std": 0.30713027715682983, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.80908203125, + "rewards/tag_count_reward/std": 0.2709888517856598, "step": 936 }, { @@ -27159,27 +27159,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1814.0, - "completions/mean_length": 679.859375, - "completions/mean_terminated_length": 658.1428833007812, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 1937.0, + "completions/mean_length": 877.078125, + "completions/mean_terminated_length": 814.4362182617188, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, "epoch": 0.3198771016471793, - "grad_norm": 5.114379405975342, - "kl": 2.4375, - "learning_rate": 8.739262745643832e-07, - "loss": 0.1968, - "num_tokens": 563355053.0, - "reward": 1.02978515625, - "reward_std": 0.23758836090564728, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.94580078125, - "rewards/tag_count_reward/std": 0.1503971964120865, + "grad_norm": 7.224167823791504, + "kl": 4.2421875, + "learning_rate": 8.742075040060037e-07, + "loss": 0.3629, + "num_tokens": 624498658.0, + "reward": 0.9658203125, + "reward_std": 0.2918092608451843, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8759765625, + "rewards/tag_count_reward/std": 0.2303936630487442, "step": 937 }, { @@ -27188,27 +27188,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1886.0, - "completions/mean_length": 752.609375, - "completions/mean_terminated_length": 713.5130615234375, - "completions/min_length": 207.0, - "completions/min_terminated_length": 207.0, + "completions/max_terminated_length": 1932.0, + "completions/mean_length": 912.990234375, + "completions/mean_terminated_length": 847.3284912109375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, "epoch": 0.3202184859605701, - "grad_norm": 8.205772399902344, - "kl": 2.47265625, - "learning_rate": 8.735539086499291e-07, - "loss": 0.2062, - "num_tokens": 563814917.0, - "reward": 1.0048828125, - "reward_std": 0.25415509939193726, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.1731034219264984, + "grad_norm": 8.428768157958984, + "kl": 4.12890625, + "learning_rate": 8.738353444319146e-07, + "loss": 0.3452, + "num_tokens": 625040637.0, + "reward": 0.96240234375, + "reward_std": 0.32676878571510315, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86669921875, + "rewards/tag_count_reward/std": 0.23705102503299713, "step": 938 }, { @@ -27217,27 +27217,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1952.0, - "completions/mean_length": 788.453125, - "completions/mean_terminated_length": 760.7984008789062, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 999.67578125, + "completions/mean_terminated_length": 901.1154174804688, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, "epoch": 0.32055987027396093, - "grad_norm": 6.041194438934326, - "kl": 2.4609375, - "learning_rate": 8.731810835098805e-07, - "loss": 0.1909, - "num_tokens": 564293165.0, - "reward": 0.98046875, - "reward_std": 0.2068929672241211, + "grad_norm": 11.9134521484375, + "kl": 4.171875, + "learning_rate": 8.734627248839889e-07, + "loss": 0.393, + "num_tokens": 625627031.0, + "reward": 0.89453125, + "reward_std": 0.25608599185943604, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.94140625, - "rewards/tag_count_reward/std": 0.15945225954055786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.23845018446445465, "step": 939 }, { @@ -27246,27 +27246,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.005859375, + "completions/clipped_ratio": 0.083984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 729.88671875, - "completions/mean_terminated_length": 722.117919921875, - "completions/min_length": 9.0, - "completions/min_terminated_length": 9.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 939.853515625, + "completions/mean_terminated_length": 838.2537231445312, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, "epoch": 0.3209012545873517, - "grad_norm": 5.101868629455566, - "kl": 2.65234375, - "learning_rate": 8.728077996733945e-07, - "loss": 0.2009, - "num_tokens": 564740915.0, - "reward": 1.0068359375, - "reward_std": 0.21976228058338165, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, + "grad_norm": 6.882454872131348, + "kl": 4.91796875, + "learning_rate": 8.730896458914934e-07, + "loss": 0.424, + "num_tokens": 626182284.0, + "reward": 0.94287109375, + "reward_std": 0.2925563454627991, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.001953125, "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.9462890625, - "rewards/tag_count_reward/std": 0.150978222489357, + "rewards/tag_count_reward/mean": 0.87451171875, + "rewards/tag_count_reward/std": 0.2314545065164566, "step": 940 }, { @@ -27275,27 +27275,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.1484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 776.39453125, - "completions/mean_terminated_length": 751.0637817382812, - "completions/min_length": 34.0, - "completions/min_terminated_length": 34.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 1052.755859375, + "completions/mean_terminated_length": 879.2728881835938, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, "epoch": 0.3212426389007425, - "grad_norm": 2.568970203399658, - "kl": 3.15625, - "learning_rate": 8.724340576702791e-07, - "loss": 0.2346, - "num_tokens": 565213661.0, - "reward": 1.02392578125, - "reward_std": 0.2454395741224289, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.17037923634052277, + "grad_norm": 2.9978644847869873, + "kl": 6.828125, + "learning_rate": 8.727161079843475e-07, + "loss": 0.5207, + "num_tokens": 626796527.0, + "reward": 0.95849609375, + "reward_std": 0.34301111102104187, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.83935546875, + "rewards/tag_count_reward/std": 0.25926846265792847, "step": 941 }, { @@ -27304,27 +27304,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.189453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 860.27734375, - "completions/mean_terminated_length": 824.4305419921875, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1164.68359375, + "completions/mean_terminated_length": 958.2216796875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, "epoch": 0.3215840232141333, - "grad_norm": 3.625922441482544, - "kl": 4.6875, - "learning_rate": 8.72059858030993e-07, - "loss": 0.2448, - "num_tokens": 565751115.0, - "reward": 0.9619140625, - "reward_std": 0.23727825284004211, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.1926850527524948, + "grad_norm": 9.706525802612305, + "kl": 8.328125, + "learning_rate": 8.723421116931221e-07, + "loss": 0.5975, + "num_tokens": 627489837.0, + "reward": 0.85107421875, + "reward_std": 0.29063451290130615, + "rewards/accuracy_reward/mean": 0.03515625, + "rewards/accuracy_reward/std": 0.1843547374010086, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.81591796875, + "rewards/tag_count_reward/std": 0.27256399393081665, "step": 942 }, { @@ -27333,27 +27333,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.17578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1892.0, - "completions/mean_length": 714.87109375, - "completions/mean_terminated_length": 699.0632934570312, - "completions/min_length": 40.0, - "completions/min_terminated_length": 40.0, + "completions/max_terminated_length": 1932.0, + "completions/mean_length": 1038.939453125, + "completions/mean_terminated_length": 823.7369995117188, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, "epoch": 0.32192540752752413, - "grad_norm": 4.811487674713135, - "kl": 4.54296875, - "learning_rate": 8.716852012866438e-07, - "loss": 0.2542, - "num_tokens": 566192329.0, - "reward": 1.05224609375, - "reward_std": 0.2621305286884308, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.16196657717227936, + "grad_norm": 4.725491523742676, + "kl": 9.28125, + "learning_rate": 8.719676575490393e-07, + "loss": 0.7163, + "num_tokens": 628096974.0, + "reward": 0.94140625, + "reward_std": 0.35627156496047974, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.826171875, + "rewards/tag_count_reward/std": 0.27724990248680115, "step": 943 }, { @@ -27362,27 +27362,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.142578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 747.833984375, - "completions/mean_terminated_length": 735.0118408203125, - "completions/min_length": 102.0, - "completions/min_terminated_length": 102.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1034.279296875, + "completions/mean_terminated_length": 865.7107543945312, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, "epoch": 0.3222667918409149, - "grad_norm": 3.6192831993103027, - "kl": 4.82421875, - "learning_rate": 8.713100879689886e-07, - "loss": 0.2576, - "num_tokens": 566651476.0, - "reward": 1.0556640625, - "reward_std": 0.27593910694122314, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, - "rewards/tag_count_reward/mean": 0.9345703125, - "rewards/tag_count_reward/std": 0.17588526010513306, + "grad_norm": 5.5540008544921875, + "kl": 7.6484375, + "learning_rate": 8.715927460839717e-07, + "loss": 0.571, + "num_tokens": 628702781.0, + "reward": 0.95068359375, + "reward_std": 0.3199988305568695, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86083984375, + "rewards/tag_count_reward/std": 0.2566048800945282, "step": 944 }, { @@ -27391,27 +27391,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.14453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1891.0, - "completions/mean_length": 726.2421875, - "completions/mean_terminated_length": 697.2215576171875, - "completions/min_length": 15.0, - "completions/min_terminated_length": 15.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1024.484375, + "completions/mean_terminated_length": 851.5615844726562, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.3226081761543057, - "grad_norm": 4.048362731933594, - "kl": 3.89453125, - "learning_rate": 8.709345186104319e-07, - "loss": 0.1823, - "num_tokens": 567098240.0, - "reward": 1.01416015625, - "reward_std": 0.26617729663848877, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.17600135505199432, + "grad_norm": 7.885499000549316, + "kl": 6.9453125, + "learning_rate": 8.712173778304414e-07, + "loss": 0.4744, + "num_tokens": 629302245.0, + "reward": 0.927734375, + "reward_std": 0.28372901678085327, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.861328125, + "rewards/tag_count_reward/std": 0.2523055970668793, "step": 945 }, { @@ -27420,27 +27420,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.126953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1872.0, - "completions/mean_length": 702.59375, - "completions/mean_terminated_length": 692.0, - "completions/min_length": 83.0, - "completions/min_terminated_length": 83.0, + "completions/max_terminated_length": 1956.0, + "completions/mean_length": 1021.466796875, + "completions/mean_terminated_length": 872.1946411132812, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, "epoch": 0.3229495604676965, - "grad_norm": 3.2535111904144287, - "kl": 3.25, - "learning_rate": 8.705584937440257e-07, - "loss": 0.1517, - "num_tokens": 567544672.0, - "reward": 1.06005859375, - "reward_std": 0.23062613606452942, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.15406294167041779, + "grad_norm": 6.5982666015625, + "kl": 6.265625, + "learning_rate": 8.708415533216192e-07, + "loss": 0.5247, + "num_tokens": 629911940.0, + "reward": 0.98388671875, + "reward_std": 0.3080558776855469, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88037109375, + "rewards/tag_count_reward/std": 0.23661932349205017, "step": 946 }, { @@ -27449,27 +27449,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.001953125, + "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1906.0, - "completions/mean_length": 737.283203125, - "completions/mean_terminated_length": 734.7182006835938, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 1080.47265625, + "completions/mean_terminated_length": 952.039794921875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, "epoch": 0.32329094478108733, - "grad_norm": 1.3384392261505127, - "kl": 2.703125, - "learning_rate": 8.701820139034686e-07, - "loss": 0.1537, - "num_tokens": 568000337.0, - "reward": 1.052734375, - "reward_std": 0.22163353860378265, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.001953125, - "rewards/format_reward/std": 0.04419417306780815, - "rewards/tag_count_reward/mean": 0.9453125, - "rewards/tag_count_reward/std": 0.14899368584156036, + "grad_norm": 7.273067951202393, + "kl": 7.3125, + "learning_rate": 8.70465273091324e-07, + "loss": 0.5307, + "num_tokens": 630543318.0, + "reward": 0.99560546875, + "reward_std": 0.3183760643005371, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88037109375, + "rewards/tag_count_reward/std": 0.23765088617801666, "step": 947 }, { @@ -27478,27 +27478,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.130859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1867.0, - "completions/mean_length": 777.642578125, - "completions/mean_terminated_length": 765.1143798828125, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 1075.09765625, + "completions/mean_terminated_length": 928.61572265625, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, "epoch": 0.3236323290944781, - "grad_norm": 1.0986766815185547, - "kl": 1.90625, - "learning_rate": 8.698050796231049e-07, - "loss": 0.1157, - "num_tokens": 568467706.0, - "reward": 1.04443359375, - "reward_std": 0.20831140875816345, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.95654296875, - "rewards/tag_count_reward/std": 0.13613753020763397, + "grad_norm": 5.45756721496582, + "kl": 6.7421875, + "learning_rate": 8.700885376740221e-07, + "loss": 0.4869, + "num_tokens": 631162984.0, + "reward": 0.958984375, + "reward_std": 0.27324575185775757, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.2387385219335556, "step": 948 }, { @@ -27507,56 +27507,56 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1897.0, - "completions/mean_length": 773.7734375, - "completions/mean_terminated_length": 758.6640625, - "completions/min_length": 204.0, - "completions/min_terminated_length": 204.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1043.71875, + "completions/mean_terminated_length": 905.3511352539062, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, "epoch": 0.3239737134078689, - "grad_norm": 3.154078245162964, - "kl": 1.720703125, - "learning_rate": 8.694276914379237e-07, - "loss": 0.1252, - "num_tokens": 568940246.0, - "reward": 1.04638671875, - "reward_std": 0.261174738407135, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, - "rewards/tag_count_reward/mean": 0.94677734375, - "rewards/tag_count_reward/std": 0.1491146832704544, - "step": 949 - }, - { - "clip_ratio/high_max": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/low_mean": 0.0, + "grad_norm": 4.094376087188721, + "kl": 5.7109375, + "learning_rate": 8.697113476048263e-07, + "loss": 0.4158, + "num_tokens": 631773736.0, + "reward": 1.0166015625, + "reward_std": 0.3208475112915039, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8916015625, + "rewards/tag_count_reward/std": 0.2238643318414688, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1977.0, - "completions/mean_length": 736.962890625, - "completions/mean_terminated_length": 726.6397705078125, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 986.486328125, + "completions/mean_terminated_length": 896.5275268554688, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, "epoch": 0.3243150977212597, - "grad_norm": 1.677687644958496, - "kl": 1.6796875, - "learning_rate": 8.690498498835586e-07, - "loss": 0.107, - "num_tokens": 569403843.0, - "reward": 1.06494140625, - "reward_std": 0.25524967908859253, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, - "rewards/tag_count_reward/mean": 0.94384765625, - "rewards/tag_count_reward/std": 0.15841138362884521, + "grad_norm": 9.125158309936523, + "kl": 4.4296875, + "learning_rate": 8.693337034194952e-07, + "loss": 0.3925, + "num_tokens": 632365089.0, + "reward": 1.03271484375, + "reward_std": 0.3009680509567261, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91748046875, + "rewards/tag_count_reward/std": 0.2054828256368637, "step": 950 }, { @@ -27565,27 +27565,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.083984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 783.171875, - "completions/mean_terminated_length": 773.2125854492188, - "completions/min_length": 230.0, - "completions/min_terminated_length": 230.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 1051.46484375, + "completions/mean_terminated_length": 960.0980834960938, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, "epoch": 0.32465648203465053, - "grad_norm": 2.603933334350586, - "kl": 2.146484375, - "learning_rate": 8.686715554962869e-07, - "loss": 0.1315, - "num_tokens": 569888747.0, - "reward": 1.04443359375, - "reward_std": 0.2933083176612854, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.17145265638828278, + "grad_norm": 8.623359680175781, + "kl": 4.4453125, + "learning_rate": 8.689556056544323e-07, + "loss": 0.3687, + "num_tokens": 632987359.0, + "reward": 1.05078125, + "reward_std": 0.3178989291191101, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.2057807892560959, "step": 951 }, { @@ -27594,27 +27594,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.09765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 757.67578125, - "completions/mean_terminated_length": 742.3755493164062, - "completions/min_length": 203.0, - "completions/min_terminated_length": 203.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1030.28125, + "completions/mean_terminated_length": 920.1385498046875, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, "epoch": 0.3249978663480413, - "grad_norm": 1.5893560647964478, - "kl": 3.08984375, - "learning_rate": 8.682928088130278e-07, - "loss": 0.1757, - "num_tokens": 570356213.0, - "reward": 1.04296875, - "reward_std": 0.29115957021713257, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.17468871176242828, + "grad_norm": 5.050754547119141, + "kl": 4.203125, + "learning_rate": 8.685770548466857e-07, + "loss": 0.3522, + "num_tokens": 633594399.0, + "reward": 1.0556640625, + "reward_std": 0.30803218483924866, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.20474500954151154, "step": 952 }, { @@ -27623,27 +27623,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.115234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 826.5234375, - "completions/mean_terminated_length": 787.1209716796875, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1040.634765625, + "completions/mean_terminated_length": 909.4326782226562, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, "epoch": 0.3253392506614321, - "grad_norm": 3.879516839981079, - "kl": 5.0859375, - "learning_rate": 8.679136103713431e-07, - "loss": 0.277, - "num_tokens": 570859249.0, - "reward": 0.9921875, - "reward_std": 0.2740318775177002, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.90625, - "rewards/tag_count_reward/std": 0.19659529626369476, + "grad_norm": 5.704651355743408, + "kl": 4.69921875, + "learning_rate": 8.681980515339463e-07, + "loss": 0.3762, + "num_tokens": 634207060.0, + "reward": 0.99658203125, + "reward_std": 0.29545044898986816, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90087890625, + "rewards/tag_count_reward/std": 0.21742625534534454, "step": 953 }, { @@ -27652,27 +27652,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 789.013671875, - "completions/mean_terminated_length": 774.0850219726562, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/max_terminated_length": 1872.0, + "completions/mean_length": 1031.1171875, + "completions/mean_terminated_length": 949.5949096679688, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, "epoch": 0.3256806349748229, - "grad_norm": 2.807542085647583, - "kl": 3.13671875, - "learning_rate": 8.675339607094356e-07, - "loss": 0.148, - "num_tokens": 571337144.0, - "reward": 1.01171875, - "reward_std": 0.2758334279060364, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.017578125, - "rewards/format_reward/std": 0.13154059648513794, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.17043600976467133, + "grad_norm": 7.719779968261719, + "kl": 3.67578125, + "learning_rate": 8.678185962545486e-07, + "loss": 0.3183, + "num_tokens": 634808912.0, + "reward": 1.01025390625, + "reward_std": 0.26246875524520874, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.1820801943540573, "step": 954 }, { @@ -27681,27 +27681,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 771.439453125, - "completions/mean_terminated_length": 751.1766357421875, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 1958.0, + "completions/mean_length": 1044.75390625, + "completions/mean_terminated_length": 911.5796508789062, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.3260220192882137, - "grad_norm": 1.9244072437286377, - "kl": 3.5625, - "learning_rate": 8.671538603661489e-07, - "loss": 0.1791, - "num_tokens": 571805017.0, - "reward": 1.056640625, - "reward_std": 0.3241751194000244, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, - "rewards/format_reward/mean": 0.01953125, - "rewards/format_reward/std": 0.1385180652141571, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.17468871176242828, + "grad_norm": 9.021772384643555, + "kl": 6.6875, + "learning_rate": 8.674386895474688e-07, + "loss": 0.4649, + "num_tokens": 635416722.0, + "reward": 1.04736328125, + "reward_std": 0.3167871832847595, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90673828125, + "rewards/tag_count_reward/std": 0.221117302775383, "step": 955 }, { @@ -27710,27 +27710,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.10546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 781.017578125, - "completions/mean_terminated_length": 753.1995849609375, - "completions/min_length": 8.0, - "completions/min_terminated_length": 8.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1003.0, + "completions/mean_terminated_length": 879.7904052734375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, "epoch": 0.3263634036016045, - "grad_norm": 2.9450836181640625, - "kl": 4.9140625, - "learning_rate": 8.667733098809655e-07, - "loss": 0.2572, - "num_tokens": 572278274.0, - "reward": 1.05126953125, - "reward_std": 0.36355161666870117, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, - "rewards/format_reward/mean": 0.01953125, - "rewards/format_reward/std": 0.1385180652141571, - "rewards/tag_count_reward/mean": 0.90869140625, - "rewards/tag_count_reward/std": 0.2047327607870102, + "grad_norm": 3.2919187545776367, + "kl": 6.1328125, + "learning_rate": 8.670583319523236e-07, + "loss": 0.4859, + "num_tokens": 636003634.0, + "reward": 1.04345703125, + "reward_std": 0.32933372259140015, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90478515625, + "rewards/tag_count_reward/std": 0.21972574293613434, "step": 956 }, { @@ -27739,27 +27739,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.09765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 801.27734375, - "completions/mean_terminated_length": 778.9701538085938, - "completions/min_length": 95.0, - "completions/min_terminated_length": 95.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1005.654296875, + "completions/mean_terminated_length": 892.8463134765625, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.3267047879149953, - "grad_norm": 4.156304359436035, - "kl": 3.5703125, - "learning_rate": 8.663923097940072e-07, - "loss": 0.2343, - "num_tokens": 572765744.0, - "reward": 1.00732421875, - "reward_std": 0.303446888923645, - "rewards/accuracy_reward/mean": 0.08467742055654526, - "rewards/accuracy_reward/std": 0.278682142496109, - "rewards/format_reward/mean": 0.01171875, - "rewards/format_reward/std": 0.10772226005792618, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.20565944910049438, + "grad_norm": 3.2543132305145264, + "kl": 5.78515625, + "learning_rate": 8.666775240093711e-07, + "loss": 0.4701, + "num_tokens": 636595745.0, + "reward": 1.01806640625, + "reward_std": 0.29099416732788086, + "rewards/accuracy_reward/mean": 0.1088709682226181, + "rewards/accuracy_reward/std": 0.31179171800613403, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91259765625, + "rewards/tag_count_reward/std": 0.21054047346115112, "step": 957 }, { @@ -27768,27 +27768,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1906.0, - "completions/mean_length": 783.15234375, - "completions/mean_terminated_length": 755.3812255859375, - "completions/min_length": 43.0, - "completions/min_terminated_length": 43.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1010.88671875, + "completions/mean_terminated_length": 886.0700073242188, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.3270461722283861, - "grad_norm": 2.296147584915161, - "kl": 3.46875, - "learning_rate": 8.660108606460343e-07, - "loss": 0.1805, - "num_tokens": 573242446.0, - "reward": 0.9892578125, - "reward_std": 0.2723425030708313, - "rewards/accuracy_reward/mean": 0.05645161122083664, - "rewards/accuracy_reward/std": 0.23102475702762604, - "rewards/format_reward/mean": 0.015625, - "rewards/format_reward/std": 0.12414088100194931, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.19351638853549957, + "grad_norm": 6.471435546875, + "kl": 6.53125, + "learning_rate": 8.662962662595088e-07, + "loss": 0.5, + "num_tokens": 637189047.0, + "reward": 1.00537109375, + "reward_std": 0.26809391379356384, + "rewards/accuracy_reward/mean": 0.09677419066429138, + "rewards/accuracy_reward/std": 0.2959485352039337, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91162109375, + "rewards/tag_count_reward/std": 0.21587374806404114, "step": 958 }, { @@ -27797,27 +27797,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1982.0, - "completions/mean_length": 768.36328125, - "completions/mean_terminated_length": 737.6520385742188, - "completions/min_length": 18.0, - "completions/min_terminated_length": 18.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 997.845703125, + "completions/mean_terminated_length": 868.87939453125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.3273875565417769, - "grad_norm": 1.4120590686798096, - "kl": 2.927734375, - "learning_rate": 8.656289629784439e-07, - "loss": 0.181, - "num_tokens": 573708504.0, - "reward": 1.03125, - "reward_std": 0.25113603472709656, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.01171875, - "rewards/format_reward/std": 0.10772226005792618, - "rewards/tag_count_reward/mean": 0.931640625, - "rewards/tag_count_reward/std": 0.17051447927951813, + "grad_norm": 6.338634014129639, + "kl": 6.046875, + "learning_rate": 8.659145592442727e-07, + "loss": 0.4275, + "num_tokens": 637772600.0, + "reward": 1.02783203125, + "reward_std": 0.2783639430999756, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91259765625, + "rewards/tag_count_reward/std": 0.21456845104694366, "step": 959 }, { @@ -27826,27 +27826,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1833.0, - "completions/mean_length": 777.876953125, - "completions/mean_terminated_length": 757.71630859375, - "completions/min_length": 11.0, - "completions/min_terminated_length": 11.0, + "completions/max_terminated_length": 1860.0, + "completions/mean_length": 924.716796875, + "completions/mean_terminated_length": 857.2733154296875, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, "epoch": 0.3277289408551677, - "grad_norm": 1.5171974897384644, - "kl": 2.580078125, - "learning_rate": 8.652466173332698e-07, - "loss": 0.119, - "num_tokens": 574183897.0, - "reward": 1.07568359375, - "reward_std": 0.29082536697387695, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310423493385315, - "rewards/format_reward/mean": 0.01171875, - "rewards/format_reward/std": 0.10772226005792618, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.15968506038188934, + "grad_norm": 3.816645622253418, + "kl": 3.859375, + "learning_rate": 8.655324035058372e-07, + "loss": 0.3263, + "num_tokens": 638323175.0, + "reward": 1.0966796875, + "reward_std": 0.2696460485458374, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9423828125, + "rewards/tag_count_reward/std": 0.17860238254070282, "step": 960 }, { @@ -27855,27 +27855,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 846.44921875, - "completions/mean_terminated_length": 812.670654296875, - "completions/min_length": 42.0, - "completions/min_terminated_length": 42.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 1026.986328125, + "completions/mean_terminated_length": 904.1072387695312, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, "epoch": 0.3280703251685585, - "grad_norm": 4.278796672821045, - "kl": 3.5703125, - "learning_rate": 8.648638242531817e-07, - "loss": 0.2517, - "num_tokens": 574695887.0, - "reward": 1.046875, - "reward_std": 0.28213781118392944, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.1840227097272873, + "grad_norm": 4.609213829040527, + "kl": 6.3046875, + "learning_rate": 8.651497995870145e-07, + "loss": 0.4766, + "num_tokens": 638927600.0, + "reward": 1.0107421875, + "reward_std": 0.26594099402427673, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9091796875, + "rewards/tag_count_reward/std": 0.22075238823890686, "step": 961 }, { @@ -27884,27 +27884,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1910.0, - "completions/mean_length": 918.970703125, - "completions/mean_terminated_length": 877.83203125, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 1094.94921875, + "completions/mean_terminated_length": 963.6400146484375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.3284117094819493, - "grad_norm": 3.791576623916626, - "kl": 3.14453125, - "learning_rate": 8.644805842814846e-07, - "loss": 0.2258, - "num_tokens": 575248672.0, - "reward": 0.97607421875, - "reward_std": 0.2750016450881958, - "rewards/accuracy_reward/mean": 0.04838709533214569, - "rewards/accuracy_reward/std": 0.21479946374893188, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.1965412050485611, + "grad_norm": 9.875836372375488, + "kl": 7.0703125, + "learning_rate": 8.647667480312525e-07, + "loss": 0.4788, + "num_tokens": 639570486.0, + "reward": 0.96435546875, + "reward_std": 0.27260592579841614, + "rewards/accuracy_reward/mean": 0.07056451588869095, + "rewards/accuracy_reward/std": 0.25635460019111633, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89599609375, + "rewards/tag_count_reward/std": 0.23418381810188293, "step": 962 }, { @@ -27913,27 +27913,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 833.83984375, - "completions/mean_terminated_length": 812.1152954101562, - "completions/min_length": 192.0, - "completions/min_terminated_length": 192.0, + "completions/max_terminated_length": 1790.0, + "completions/mean_length": 988.181640625, + "completions/mean_terminated_length": 915.1670532226562, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, "epoch": 0.3287530937953401, - "grad_norm": 2.048088550567627, - "kl": 3.6796875, - "learning_rate": 8.640968979621174e-07, - "loss": 0.2305, - "num_tokens": 575758190.0, - "reward": 1.01220703125, - "reward_std": 0.2654229402542114, - "rewards/accuracy_reward/mean": 0.07258064299821854, - "rewards/accuracy_reward/std": 0.25970885157585144, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.17530503869056702, + "grad_norm": 3.0637049674987793, + "kl": 4.32421875, + "learning_rate": 8.643832493826357e-07, + "loss": 0.347, + "num_tokens": 640159027.0, + "reward": 1.0615234375, + "reward_std": 0.26371705532073975, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310528099536896, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9404296875, + "rewards/tag_count_reward/std": 0.16878816485404968, "step": 963 }, { @@ -27942,27 +27942,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.091796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1822.0, - "completions/mean_length": 829.357421875, - "completions/mean_terminated_length": 790.0463256835938, - "completions/min_length": 12.0, - "completions/min_terminated_length": 12.0, + "completions/max_terminated_length": 1943.0, + "completions/mean_length": 1015.404296875, + "completions/mean_terminated_length": 911.034423828125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.3290944781087309, - "grad_norm": 1.7086501121520996, - "kl": 4.04296875, - "learning_rate": 8.637127658396526e-07, - "loss": 0.2484, - "num_tokens": 576260197.0, - "reward": 1.0205078125, - "reward_std": 0.276422917842865, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.9306640625, - "rewards/tag_count_reward/std": 0.18193122744560242, + "grad_norm": 4.8381876945495605, + "kl": 5.14453125, + "learning_rate": 8.639993041858832e-07, + "loss": 0.3775, + "num_tokens": 640756290.0, + "reward": 1.03076171875, + "reward_std": 0.272873193025589, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.20237624645233154, "step": 964 }, { @@ -27971,27 +27971,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.09765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 882.51953125, - "completions/mean_terminated_length": 832.672119140625, - "completions/min_length": 174.0, - "completions/min_terminated_length": 174.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1043.025390625, + "completions/mean_terminated_length": 934.2619018554688, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, "epoch": 0.3294358624221217, - "grad_norm": 1.9281989336013794, - "kl": 4.921875, - "learning_rate": 8.633281884592957e-07, - "loss": 0.3391, - "num_tokens": 576790463.0, - "reward": 0.98095703125, - "reward_std": 0.24103042483329773, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.009765625, - "rewards/format_reward/std": 0.09843364357948303, + "grad_norm": 3.957658290863037, + "kl": 5.42578125, + "learning_rate": 8.636149129863484e-07, + "loss": 0.3928, + "num_tokens": 641368735.0, + "reward": 0.97705078125, + "reward_std": 0.20857132971286774, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.1815967559814453, + "rewards/tag_count_reward/std": 0.20321024954319, "step": 965 }, { @@ -28000,27 +28000,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1955.0, - "completions/mean_length": 848.255859375, - "completions/mean_terminated_length": 807.0525512695312, - "completions/min_length": 82.0, - "completions/min_terminated_length": 82.0, + "completions/max_terminated_length": 1791.0, + "completions/mean_length": 941.623046875, + "completions/mean_terminated_length": 887.2109985351562, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.3297772467355125, - "grad_norm": 3.4018478393554688, - "kl": 4.98828125, - "learning_rate": 8.629431663668834e-07, - "loss": 0.3018, - "num_tokens": 577315090.0, - "reward": 0.9697265625, - "reward_std": 0.24910764396190643, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.00390625, - "rewards/format_reward/std": 0.06243881583213806, - "rewards/tag_count_reward/mean": 0.9130859375, - "rewards/tag_count_reward/std": 0.19538374245166779, + "grad_norm": 3.6601791381835938, + "kl": 3.296875, + "learning_rate": 8.632300763300187e-07, + "loss": 0.2518, + "num_tokens": 641941166.0, + "reward": 1.005859375, + "reward_std": 0.21847085654735565, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.947265625, + "rewards/tag_count_reward/std": 0.17105160653591156, "step": 966 }, { @@ -28029,27 +28029,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1974.0, - "completions/mean_length": 775.2421875, - "completions/mean_terminated_length": 747.2974243164062, - "completions/min_length": 68.0, - "completions/min_terminated_length": 68.0, + "completions/max_terminated_length": 1951.0, + "completions/mean_length": 937.015625, + "completions/mean_terminated_length": 875.1670532226562, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.3301186310489033, - "grad_norm": 3.637774705886841, - "kl": 4.91015625, - "learning_rate": 8.625577001088848e-07, - "loss": 0.2741, - "num_tokens": 577790062.0, - "reward": 0.96728515625, - "reward_std": 0.24410133063793182, - "rewards/accuracy_reward/mean": 0.04032257944345474, - "rewards/accuracy_reward/std": 0.19691328704357147, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.18542389571666718, + "grad_norm": 5.90120267868042, + "kl": 3.19140625, + "learning_rate": 8.628447947635135e-07, + "loss": 0.2521, + "num_tokens": 642498966.0, + "reward": 1.02587890625, + "reward_std": 0.2575606107711792, + "rewards/accuracy_reward/mean": 0.08870967477560043, + "rewards/accuracy_reward/std": 0.284611314535141, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93994140625, + "rewards/tag_count_reward/std": 0.1718478947877884, "step": 967 }, { @@ -28058,27 +28058,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 789.330078125, - "completions/mean_terminated_length": 751.342041015625, - "completions/min_length": 197.0, - "completions/min_terminated_length": 197.0, + "completions/max_terminated_length": 1906.0, + "completions/mean_length": 906.705078125, + "completions/mean_terminated_length": 848.1170654296875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, "epoch": 0.3304600153622941, - "grad_norm": 7.181121349334717, - "kl": 5.7734375, - "learning_rate": 8.621717902323987e-07, - "loss": 0.3185, - "num_tokens": 578268343.0, - "reward": 1.02001953125, - "reward_std": 0.29213935136795044, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.013671875, - "rewards/format_reward/std": 0.1162383034825325, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.18625172972679138, + "grad_norm": 7.9917731285095215, + "kl": 3.01171875, + "learning_rate": 8.624590688340846e-07, + "loss": 0.2606, + "num_tokens": 643037343.0, + "reward": 1.06982421875, + "reward_std": 0.24564072489738464, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94482421875, + "rewards/tag_count_reward/std": 0.16628073155879974, "step": 968 }, { @@ -28087,27 +28087,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 758.626953125, - "completions/mean_terminated_length": 745.9112548828125, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 863.822265625, + "completions/mean_terminated_length": 815.6849365234375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, "epoch": 0.3308013996756849, - "grad_norm": 3.7928194999694824, - "kl": 4.27734375, - "learning_rate": 8.61785437285153e-07, - "loss": 0.2576, - "num_tokens": 578731752.0, - "reward": 1.02783203125, - "reward_std": 0.282349556684494, - "rewards/accuracy_reward/mean": 0.09879032522439957, - "rewards/accuracy_reward/std": 0.2986815273761749, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.16716337203979492, + "grad_norm": 4.488840103149414, + "kl": 4.3046875, + "learning_rate": 8.62072899089615e-07, + "loss": 0.3175, + "num_tokens": 643554612.0, + "reward": 1.0673828125, + "reward_std": 0.26376453042030334, + "rewards/accuracy_reward/mean": 0.1270161271095276, + "rewards/accuracy_reward/std": 0.33332720398902893, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9443359375, + "rewards/tag_count_reward/std": 0.16426658630371094, "step": 969 }, { @@ -28116,27 +28116,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 755.302734375, - "completions/mean_terminated_length": 737.3841552734375, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 1881.0, + "completions/mean_length": 880.435546875, + "completions/mean_terminated_length": 807.765625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, "epoch": 0.3311427839890757, - "grad_norm": 2.746143341064453, - "kl": 3.6484375, - "learning_rate": 8.613986418155055e-07, - "loss": 0.1963, - "num_tokens": 579199811.0, - "reward": 1.00537109375, - "reward_std": 0.24694019556045532, - "rewards/accuracy_reward/mean": 0.07258064299821854, - "rewards/accuracy_reward/std": 0.25970885157585144, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.1755392700433731, + "grad_norm": 4.282420635223389, + "kl": 4.484375, + "learning_rate": 8.616862860786177e-07, + "loss": 0.3145, + "num_tokens": 644086739.0, + "reward": 1.064453125, + "reward_std": 0.24224328994750977, + "rewards/accuracy_reward/mean": 0.1270161271095276, + "rewards/accuracy_reward/std": 0.33332720398902893, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.17621363699436188, "step": 970 }, { @@ -28145,27 +28145,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 852.048828125, - "completions/mean_terminated_length": 795.7975463867188, - "completions/min_length": 56.0, - "completions/min_terminated_length": 56.0, + "completions/max_terminated_length": 1795.0, + "completions/mean_length": 939.048828125, + "completions/mean_terminated_length": 874.8945922851562, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, "epoch": 0.3314841683024665, - "grad_norm": 4.72691535949707, - "kl": 2.98828125, - "learning_rate": 8.610114043724416e-07, - "loss": 0.2596, - "num_tokens": 579727212.0, - "reward": 0.97314453125, - "reward_std": 0.25940871238708496, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.1923593431711197, + "grad_norm": 3.5683434009552, + "kl": 3.62890625, + "learning_rate": 8.612992303502358e-07, + "loss": 0.3157, + "num_tokens": 644658684.0, + "reward": 1.00439453125, + "reward_std": 0.22754546999931335, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.16826865077018738, "step": 971 }, { @@ -28174,27 +28174,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1968.0, - "completions/mean_length": 878.787109375, - "completions/mean_terminated_length": 826.2918090820312, - "completions/min_length": 186.0, - "completions/min_terminated_length": 186.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 960.90625, + "completions/mean_terminated_length": 895.6356201171875, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, "epoch": 0.3318255526158573, - "grad_norm": 9.027946472167969, - "kl": 3.7265625, - "learning_rate": 8.606237255055738e-07, - "loss": 0.3199, - "num_tokens": 580252207.0, - "reward": 0.99462890625, - "reward_std": 0.2623327970504761, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.18821148574352264, + "grad_norm": 5.6443376541137695, + "kl": 4.2578125, + "learning_rate": 8.609117324542409e-07, + "loss": 0.3348, + "num_tokens": 645225724.0, + "reward": 1.00390625, + "reward_std": 0.26830559968948364, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.1972164362668991, "step": 972 }, { @@ -28203,27 +28203,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 856.70703125, - "completions/mean_terminated_length": 800.6748046875, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1006.583984375, + "completions/mean_terminated_length": 925.463134765625, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, "epoch": 0.3321669369292481, - "grad_norm": 5.184525012969971, - "kl": 3.69140625, - "learning_rate": 8.602356057651416e-07, - "loss": 0.2952, - "num_tokens": 580769081.0, - "reward": 1.0419921875, - "reward_std": 0.31056830286979675, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.19321990013122559, + "grad_norm": 4.892645835876465, + "kl": 5.0703125, + "learning_rate": 8.605237929410326e-07, + "loss": 0.375, + "num_tokens": 645819335.0, + "reward": 1.05712890625, + "reward_std": 0.3197452425956726, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92236328125, + "rewards/tag_count_reward/std": 0.19334039092063904, "step": 973 }, { @@ -28232,27 +28232,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 853.109375, - "completions/mean_terminated_length": 819.51806640625, - "completions/min_length": 46.0, - "completions/min_terminated_length": 46.0, + "completions/max_terminated_length": 1869.0, + "completions/mean_length": 1016.626953125, + "completions/mean_terminated_length": 924.461669921875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.3325083212426389, - "grad_norm": 5.471117973327637, - "kl": 3.291015625, - "learning_rate": 8.598470457020101e-07, - "loss": 0.2522, - "num_tokens": 581289521.0, - "reward": 0.94287109375, - "reward_std": 0.1962069571018219, - "rewards/accuracy_reward/mean": 0.01171875, - "rewards/accuracy_reward/std": 0.10772226005792618, - "rewards/format_reward/mean": 0.005859375, - "rewards/format_reward/std": 0.07639661431312561, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.18349166214466095, + "grad_norm": 3.3567278385162354, + "kl": 5.4453125, + "learning_rate": 8.601354123616382e-07, + "loss": 0.4061, + "num_tokens": 646423496.0, + "reward": 0.97265625, + "reward_std": 0.23431171476840973, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.2057807892560959, "step": 974 }, { @@ -28261,27 +28261,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 789.197265625, - "completions/mean_terminated_length": 761.5588989257812, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 1932.0, + "completions/mean_length": 932.431640625, + "completions/mean_terminated_length": 842.9978637695312, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, "epoch": 0.3328497055560297, - "grad_norm": 3.1377015113830566, - "kl": 4.31640625, - "learning_rate": 8.594580458676688e-07, - "loss": 0.3071, - "num_tokens": 581767174.0, - "reward": 1.033203125, - "reward_std": 0.2888753414154053, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.0078125, - "rewards/format_reward/std": 0.08812850713729858, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.18796826899051666, + "grad_norm": 4.575897216796875, + "kl": 4.5390625, + "learning_rate": 8.597465912677112e-07, + "loss": 0.3606, + "num_tokens": 646974485.0, + "reward": 1.0439453125, + "reward_std": 0.2848871350288391, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.20593629777431488, "step": 975 }, { @@ -28290,27 +28290,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 867.1953125, - "completions/mean_terminated_length": 804.024658203125, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 984.625, + "completions/mean_terminated_length": 899.37548828125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.3331910898694205, - "grad_norm": 6.757725715637207, - "kl": 6.0546875, - "learning_rate": 8.59068606814232e-07, - "loss": 0.3132, - "num_tokens": 582291642.0, - "reward": 1.03955078125, - "reward_std": 0.30651384592056274, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, - "rewards/format_reward/mean": 0.01953125, - "rewards/format_reward/std": 0.1385180652141571, - "rewards/tag_count_reward/mean": 0.90283203125, - "rewards/tag_count_reward/std": 0.2044198215007782, + "grad_norm": 2.637932538986206, + "kl": 5.04296875, + "learning_rate": 8.593573302115306e-07, + "loss": 0.3697, + "num_tokens": 647559077.0, + "reward": 1.05322265625, + "reward_std": 0.3172381818294525, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90869140625, + "rewards/tag_count_reward/std": 0.2106221467256546, "step": 976 }, { @@ -28319,27 +28319,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.083984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 893.42578125, - "completions/mean_terminated_length": 848.9290161132812, - "completions/min_length": 98.0, - "completions/min_terminated_length": 98.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1006.21875, + "completions/mean_terminated_length": 910.7036743164062, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.3335324741828113, - "grad_norm": 4.673368453979492, - "kl": 7.1875, - "learning_rate": 8.586787290944373e-07, - "loss": 0.4068, - "num_tokens": 582821284.0, - "reward": 1.064453125, - "reward_std": 0.33829671144485474, - "rewards/accuracy_reward/mean": 0.150390625, - "rewards/accuracy_reward/std": 0.35780346393585205, - "rewards/format_reward/mean": 0.013671875, - "rewards/format_reward/std": 0.1162383034825325, - "rewards/tag_count_reward/mean": 0.900390625, - "rewards/tag_count_reward/std": 0.21579019725322723, + "grad_norm": 2.652325391769409, + "kl": 4.44921875, + "learning_rate": 8.589676297460005e-07, + "loss": 0.349, + "num_tokens": 648146469.0, + "reward": 1.07373046875, + "reward_std": 0.30008286237716675, + "rewards/accuracy_reward/mean": 0.166015625, + "rewards/accuracy_reward/std": 0.3724585771560669, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90771484375, + "rewards/tag_count_reward/std": 0.21077634394168854, "step": 977 }, { @@ -28348,27 +28348,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 807.15234375, - "completions/mean_terminated_length": 748.7893676757812, - "completions/min_length": 82.0, - "completions/min_terminated_length": 82.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 958.44140625, + "completions/mean_terminated_length": 824.635986328125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, "epoch": 0.3338738584962021, - "grad_norm": 6.821012020111084, - "kl": 6.984375, - "learning_rate": 8.582884132616448e-07, - "loss": 0.3878, - "num_tokens": 583314690.0, - "reward": 0.9775390625, - "reward_std": 0.3012595772743225, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.02734375, - "rewards/format_reward/std": 0.16324250400066376, - "rewards/tag_count_reward/mean": 0.8994140625, - "rewards/tag_count_reward/std": 0.21871723234653473, + "grad_norm": 3.1995010375976562, + "kl": 5.953125, + "learning_rate": 8.585774904246495e-07, + "loss": 0.4727, + "num_tokens": 648717335.0, + "reward": 0.9658203125, + "reward_std": 0.2517923414707184, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8955078125, + "rewards/tag_count_reward/std": 0.2331804633140564, "step": 978 }, { @@ -28377,27 +28377,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1975.0, - "completions/mean_length": 824.62890625, - "completions/mean_terminated_length": 767.087890625, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 1953.0, + "completions/mean_length": 986.60546875, + "completions/mean_terminated_length": 876.8060302734375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.3342152428095929, - "grad_norm": 2.729808807373047, - "kl": 7.046875, - "learning_rate": 8.578976598698364e-07, - "loss": 0.4281, - "num_tokens": 583807972.0, - "reward": 1.00146484375, - "reward_std": 0.33582258224487305, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.0234375, - "rewards/format_reward/std": 0.15143637359142303, - "rewards/tag_count_reward/mean": 0.89599609375, - "rewards/tag_count_reward/std": 0.21851344406604767, + "grad_norm": 7.902650356292725, + "kl": 4.76171875, + "learning_rate": 8.581869128016289e-07, + "loss": 0.432, + "num_tokens": 649293549.0, + "reward": 1.03076171875, + "reward_std": 0.24770037829875946, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90771484375, + "rewards/tag_count_reward/std": 0.21874944865703583, "step": 979 }, { @@ -28406,27 +28406,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.103515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 871.8984375, - "completions/mean_terminated_length": 806.4247436523438, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1027.525390625, + "completions/mean_terminated_length": 909.6928100585938, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, "epoch": 0.3345566271229837, - "grad_norm": 4.520415306091309, - "kl": 7.375, - "learning_rate": 8.57506469473615e-07, - "loss": 0.4489, - "num_tokens": 584326240.0, - "reward": 0.94970703125, - "reward_std": 0.31445467472076416, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.01953125, - "rewards/format_reward/std": 0.1385180652141571, - "rewards/tag_count_reward/mean": 0.88525390625, - "rewards/tag_count_reward/std": 0.226416677236557, + "grad_norm": 4.225567817687988, + "kl": 5.3515625, + "learning_rate": 8.577958974317131e-07, + "loss": 0.4409, + "num_tokens": 649891498.0, + "reward": 0.96875, + "reward_std": 0.2651587724685669, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.888671875, + "rewards/tag_count_reward/std": 0.22892284393310547, "step": 980 }, { @@ -28435,27 +28435,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 902.8515625, - "completions/mean_terminated_length": 851.4367065429688, - "completions/min_length": 187.0, - "completions/min_terminated_length": 187.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 977.28515625, + "completions/mean_terminated_length": 858.8330078125, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, "epoch": 0.3348980114363745, - "grad_norm": 2.7762203216552734, - "kl": 5.7890625, - "learning_rate": 8.57114842628204e-07, - "loss": 0.3659, - "num_tokens": 584862580.0, - "reward": 1.01416015625, - "reward_std": 0.3422966003417969, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.02734375, - "rewards/format_reward/std": 0.16324250400066376, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.20741750299930573, + "grad_norm": 2.766495704650879, + "kl": 5.6796875, + "learning_rate": 8.57404444870298e-07, + "loss": 0.4678, + "num_tokens": 650465948.0, + "reward": 1.02880859375, + "reward_std": 0.281044602394104, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90966796875, + "rewards/tag_count_reward/std": 0.20397058129310608, "step": 981 }, { @@ -28464,27 +28464,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 869.392578125, - "completions/mean_terminated_length": 801.2086181640625, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 966.51171875, + "completions/mean_terminated_length": 817.5067138671875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, "epoch": 0.3352393957497653, - "grad_norm": 3.293605327606201, - "kl": 4.46875, - "learning_rate": 8.567227798894458e-07, - "loss": 0.2956, - "num_tokens": 585385453.0, - "reward": 0.99560546875, - "reward_std": 0.32971763610839844, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.03515625, - "rewards/format_reward/std": 0.1843547374010086, - "rewards/tag_count_reward/mean": 0.89599609375, - "rewards/tag_count_reward/std": 0.21963004767894745, + "grad_norm": 3.7221031188964844, + "kl": 6.3671875, + "learning_rate": 8.570125556734003e-07, + "loss": 0.4943, + "num_tokens": 651038546.0, + "reward": 0.96923828125, + "reward_std": 0.2601845860481262, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89111328125, + "rewards/tag_count_reward/std": 0.22769182920455933, "step": 982 }, { @@ -28493,27 +28493,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.06640625, + "completions/clipped_ratio": 0.126953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 908.6015625, - "completions/mean_terminated_length": 827.5564575195312, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 1734.0, + "completions/mean_length": 1004.232421875, + "completions/mean_terminated_length": 852.4541625976562, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, "epoch": 0.3355807800631561, - "grad_norm": 9.126565933227539, - "kl": 4.265625, - "learning_rate": 8.56330281813802e-07, - "loss": 0.3588, - "num_tokens": 585937281.0, - "reward": 1.03515625, - "reward_std": 0.31757861375808716, - "rewards/accuracy_reward/mean": 0.119140625, - "rewards/accuracy_reward/std": 0.32427072525024414, - "rewards/format_reward/mean": 0.01953125, - "rewards/format_reward/std": 0.1385180652141571, - "rewards/tag_count_reward/mean": 0.896484375, - "rewards/tag_count_reward/std": 0.2173432856798172, + "grad_norm": 2.671978712081909, + "kl": 6.5234375, + "learning_rate": 8.566202303976576e-07, + "loss": 0.5166, + "num_tokens": 651639337.0, + "reward": 1.01904296875, + "reward_std": 0.3248867094516754, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89208984375, + "rewards/tag_count_reward/std": 0.22654324769973755, "step": 983 }, { @@ -28522,27 +28522,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1996.0, - "completions/mean_length": 863.005859375, - "completions/mean_terminated_length": 786.6340942382812, - "completions/min_length": 22.0, - "completions/min_terminated_length": 22.0, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 942.626953125, + "completions/mean_terminated_length": 828.2780151367188, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, "epoch": 0.3359221643765469, - "grad_norm": 6.865476131439209, - "kl": 3.37890625, - "learning_rate": 8.559373489583518e-07, - "loss": 0.28, - "num_tokens": 586456100.0, - "reward": 1.07373046875, - "reward_std": 0.38225170969963074, - "rewards/accuracy_reward/mean": 0.140625, - "rewards/accuracy_reward/std": 0.3479743003845215, - "rewards/format_reward/mean": 0.03125, - "rewards/format_reward/std": 0.17416280508041382, - "rewards/tag_count_reward/mean": 0.90185546875, - "rewards/tag_count_reward/std": 0.21674399077892303, + "grad_norm": 2.9651739597320557, + "kl": 5.4453125, + "learning_rate": 8.562274696003261e-07, + "loss": 0.4581, + "num_tokens": 652198922.0, + "reward": 1.052734375, + "reward_std": 0.2847357988357544, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.21326914429664612, "step": 984 }, { @@ -28551,27 +28551,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.072265625, + "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 924.205078125, - "completions/mean_terminated_length": 836.6673583984375, - "completions/min_length": 38.0, - "completions/min_terminated_length": 38.0, + "completions/max_terminated_length": 1929.0, + "completions/mean_length": 1007.3671875, + "completions/mean_terminated_length": 869.2301025390625, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, "epoch": 0.3362635486899377, - "grad_norm": 7.508963108062744, - "kl": 3.328125, - "learning_rate": 8.555439818807914e-07, - "loss": 0.3158, - "num_tokens": 587018125.0, - "reward": 1.04296875, - "reward_std": 0.34603816270828247, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, - "rewards/format_reward/mean": 0.013671875, - "rewards/format_reward/std": 0.1162383034825325, - "rewards/tag_count_reward/mean": 0.90234375, - "rewards/tag_count_reward/std": 0.21326914429664612, + "grad_norm": 3.3883941173553467, + "kl": 7.1015625, + "learning_rate": 8.55834273839281e-07, + "loss": 0.5504, + "num_tokens": 652803526.0, + "reward": 1.017578125, + "reward_std": 0.32293081283569336, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.22270244359970093, "step": 985 }, { @@ -28580,27 +28580,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.16796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1980.0, - "completions/mean_length": 914.611328125, - "completions/mean_terminated_length": 858.870849609375, - "completions/min_length": 210.0, - "completions/min_terminated_length": 210.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 1074.025390625, + "completions/mean_terminated_length": 877.4014282226562, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, "epoch": 0.3366049330033285, - "grad_norm": 8.431133270263672, - "kl": 3.2578125, - "learning_rate": 8.551501811394335e-07, - "loss": 0.2678, - "num_tokens": 587567622.0, - "reward": 0.95947265625, - "reward_std": 0.285304456949234, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.015625, - "rewards/format_reward/std": 0.12414088100194931, - "rewards/tag_count_reward/mean": 0.90478515625, - "rewards/tag_count_reward/std": 0.20770753920078278, + "grad_norm": 9.992759704589844, + "kl": 9.5, + "learning_rate": 8.554406436730153e-07, + "loss": 0.6645, + "num_tokens": 653434643.0, + "reward": 0.90771484375, + "reward_std": 0.2561982274055481, + "rewards/accuracy_reward/mean": 0.037109375, + "rewards/accuracy_reward/std": 0.18921469151973724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87060546875, + "rewards/tag_count_reward/std": 0.24326589703559875, "step": 986 }, { @@ -28609,27 +28609,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.1484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 822.861328125, - "completions/mean_terminated_length": 775.64501953125, - "completions/min_length": 79.0, - "completions/min_terminated_length": 79.0, + "completions/max_terminated_length": 1768.0, + "completions/mean_length": 1008.66015625, + "completions/mean_terminated_length": 827.4907836914062, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, "epoch": 0.3369463173167193, - "grad_norm": 4.268903732299805, - "kl": 3.21484375, - "learning_rate": 8.547559472932062e-07, - "loss": 0.2195, - "num_tokens": 588068351.0, - "reward": 1.00244140625, - "reward_std": 0.27478182315826416, + "grad_norm": 9.924880981445312, + "kl": 8.4453125, + "learning_rate": 8.550465796606388e-07, + "loss": 0.5947, + "num_tokens": 654030501.0, + "reward": 0.93017578125, + "reward_std": 0.27210235595703125, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.01953125, - "rewards/format_reward/std": 0.1385180652141571, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.19841860234737396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86376953125, + "rewards/tag_count_reward/std": 0.24554912745952606, "step": 987 }, { @@ -28638,27 +28638,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.142578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 869.857421875, - "completions/mean_terminated_length": 836.7369384765625, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 1952.0, + "completions/mean_length": 1061.1640625, + "completions/mean_terminated_length": 897.0661010742188, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, "epoch": 0.33728770163011007, - "grad_norm": 1.8480836153030396, - "kl": 3.3671875, - "learning_rate": 8.543612809016524e-07, - "loss": 0.2163, - "num_tokens": 588590518.0, - "reward": 0.9873046875, - "reward_std": 0.24880146980285645, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.015625, - "rewards/format_reward/std": 0.12414088100194931, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.18428993225097656, + "grad_norm": 10.248553276062012, + "kl": 8.3046875, + "learning_rate": 8.546520823618782e-07, + "loss": 0.606, + "num_tokens": 654650617.0, + "reward": 0.93505859375, + "reward_std": 0.281981885433197, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88037109375, + "rewards/tag_count_reward/std": 0.23139257729053497, "step": 988 }, { @@ -28667,27 +28667,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.134765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 832.720703125, - "completions/mean_terminated_length": 806.0379028320312, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 1908.0, + "completions/mean_length": 1030.5703125, + "completions/mean_terminated_length": 872.099365234375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.3376290859435009, - "grad_norm": 4.212379455566406, - "kl": 4.4296875, - "learning_rate": 8.539661825249287e-07, - "loss": 0.2464, - "num_tokens": 589098391.0, - "reward": 1.0546875, - "reward_std": 0.304060697555542, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.0390625, - "rewards/format_reward/std": 0.1939331740140915, - "rewards/tag_count_reward/mean": 0.931640625, - "rewards/tag_count_reward/std": 0.1733599156141281, + "grad_norm": 6.171142101287842, + "kl": 6.890625, + "learning_rate": 8.542571523370748e-07, + "loss": 0.5115, + "num_tokens": 655259789.0, + "reward": 0.98046875, + "reward_std": 0.28726279735565186, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88671875, + "rewards/tag_count_reward/std": 0.22634544968605042, "step": 989 }, { @@ -28696,27 +28696,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.14453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1922.0, - "completions/mean_length": 838.3984375, - "completions/mean_terminated_length": 806.8858032226562, - "completions/min_length": 168.0, - "completions/min_terminated_length": 168.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1015.796875, + "completions/mean_terminated_length": 841.4063720703125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.3379704702568917, - "grad_norm": 7.339483737945557, - "kl": 5.67578125, - "learning_rate": 8.535706527238051e-07, - "loss": 0.2762, - "num_tokens": 589609843.0, - "reward": 0.984375, - "reward_std": 0.28474554419517517, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.029296875, - "rewards/format_reward/std": 0.16880230605602264, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.19642995297908783, + "grad_norm": 5.066564559936523, + "kl": 7.21875, + "learning_rate": 8.538617901471849e-07, + "loss": 0.5477, + "num_tokens": 655862069.0, + "reward": 0.923828125, + "reward_std": 0.27728980779647827, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.873046875, + "rewards/tag_count_reward/std": 0.2502368688583374, "step": 990 }, { @@ -28725,27 +28725,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.14453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 753.921875, - "completions/mean_terminated_length": 730.767333984375, - "completions/min_length": 174.0, - "completions/min_terminated_length": 174.0, + "completions/max_terminated_length": 1895.0, + "completions/mean_length": 1009.423828125, + "completions/mean_terminated_length": 833.9566040039062, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.3383118545702825, - "grad_norm": 4.1210618019104, - "kl": 5.7265625, - "learning_rate": 8.531746920596639e-07, - "loss": 0.3426, - "num_tokens": 590078555.0, - "reward": 1.03662109375, - "reward_std": 0.3218595087528229, - "rewards/accuracy_reward/mean": 0.07258064299821854, - "rewards/accuracy_reward/std": 0.25970885157585144, - "rewards/format_reward/mean": 0.041015625, - "rewards/format_reward/std": 0.19852031767368317, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.1808057427406311, + "grad_norm": 5.284512996673584, + "kl": 5.9296875, + "learning_rate": 8.534659963537787e-07, + "loss": 0.4955, + "num_tokens": 656461598.0, + "reward": 0.9677734375, + "reward_std": 0.3076940178871155, + "rewards/accuracy_reward/mean": 0.08467742055654526, + "rewards/accuracy_reward/std": 0.278682142496109, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8857421875, + "rewards/tag_count_reward/std": 0.22747193276882172, "step": 991 }, { @@ -28754,27 +28754,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.10546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1951.0, - "completions/mean_length": 813.31640625, - "completions/mean_terminated_length": 791.224609375, - "completions/min_length": 258.0, - "completions/min_terminated_length": 258.0, + "completions/max_terminated_length": 1884.0, + "completions/mean_length": 1027.67578125, + "completions/mean_terminated_length": 907.3755493164062, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, "epoch": 0.33865323888367327, - "grad_norm": 1.6163721084594727, - "kl": 4.328125, - "learning_rate": 8.527783010944986e-07, - "loss": 0.2373, - "num_tokens": 590577277.0, + "grad_norm": 11.802164077758789, + "kl": 4.40625, + "learning_rate": 8.530697715190395e-07, + "loss": 0.4052, + "num_tokens": 657070072.0, "reward": 1.01953125, - "reward_std": 0.27651625871658325, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.015625, - "rewards/format_reward/std": 0.12414088100194931, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.17446976900100708, + "reward_std": 0.2901967167854309, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.2178088128566742, "step": 992 }, { @@ -28783,27 +28783,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.13671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, - "completions/mean_length": 802.09765625, - "completions/mean_terminated_length": 769.6392822265625, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/mean_length": 991.197265625, + "completions/mean_terminated_length": 823.830322265625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, "epoch": 0.3389946231970641, - "grad_norm": 3.7197697162628174, - "kl": 5.15625, - "learning_rate": 8.523814803909137e-07, - "loss": 0.3017, - "num_tokens": 591065839.0, - "reward": 1.0087890625, - "reward_std": 0.32167479395866394, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.03125, - "rewards/format_reward/std": 0.17416280508041382, - "rewards/tag_count_reward/mean": 0.9130859375, - "rewards/tag_count_reward/std": 0.19910427927970886, + "grad_norm": 3.6225745677948, + "kl": 5.328125, + "learning_rate": 8.526731162057626e-07, + "loss": 0.4258, + "num_tokens": 657655453.0, + "reward": 0.9619140625, + "reward_std": 0.26162561774253845, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8916015625, + "rewards/tag_count_reward/std": 0.22819331288337708, "step": 993 }, { @@ -28812,56 +28812,56 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1959.0, - "completions/mean_length": 764.3671875, - "completions/mean_terminated_length": 754.2598266601562, - "completions/min_length": 192.0, - "completions/min_terminated_length": 192.0, + "completions/max_terminated_length": 1841.0, + "completions/mean_length": 976.755859375, + "completions/mean_terminated_length": 823.7210083007812, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, "epoch": 0.3393360075104549, - "grad_norm": 2.030200242996216, - "kl": 3.28125, - "learning_rate": 8.51984230512124e-07, - "loss": 0.1716, - "num_tokens": 591533739.0, - "reward": 1.0595703125, - "reward_std": 0.26806777715682983, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.021484375, - "rewards/format_reward/std": 0.14513419568538666, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.16057194769382477, + "grad_norm": 8.34616756439209, + "kl": 4.796875, + "learning_rate": 8.522760309773552e-07, + "loss": 0.4165, + "num_tokens": 658232096.0, + "reward": 1.005859375, + "reward_std": 0.27008605003356934, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.21555092930793762, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1950.0, - "completions/mean_length": 780.6484375, - "completions/mean_terminated_length": 768.14990234375, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, - "epoch": 0.3396773918238457, - "grad_norm": 1.6519471406936646, - "kl": 2.455078125, - "learning_rate": 8.515865520219526e-07, - "loss": 0.1544, - "num_tokens": 592010247.0, - "reward": 1.01318359375, - "reward_std": 0.2568773627281189, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.021484375, - "rewards/format_reward/std": 0.14513419568538666, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.16995809972286224, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.138671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1821.0, + "completions/mean_length": 980.962890625, + "completions/mean_terminated_length": 809.17236328125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.3396773918238457, + "grad_norm": 7.350953578948975, + "kl": 6.2109375, + "learning_rate": 8.518785163978343e-07, + "loss": 0.5384, + "num_tokens": 658811165.0, + "reward": 0.9541015625, + "reward_std": 0.292828232049942, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8837890625, + "rewards/tag_count_reward/std": 0.23806315660476685, "step": 995 }, { @@ -28870,27 +28870,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.123046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 792.646484375, - "completions/mean_terminated_length": 777.7609252929688, - "completions/min_length": 62.0, - "completions/min_terminated_length": 62.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 999.31640625, + "completions/mean_terminated_length": 852.1737670898438, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, "epoch": 0.34001877613723647, - "grad_norm": 0.7317795157432556, - "kl": 1.724609375, - "learning_rate": 8.511884454848315e-07, - "loss": 0.0839, - "num_tokens": 592490946.0, - "reward": 1.09521484375, - "reward_std": 0.31519752740859985, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, - "rewards/format_reward/mean": 0.03515625, - "rewards/format_reward/std": 0.1843547374010086, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.15166257321834564, + "grad_norm": 5.460068702697754, + "kl": 6.1171875, + "learning_rate": 8.514805730318278e-07, + "loss": 0.5094, + "num_tokens": 659397679.0, + "reward": 1.00244140625, + "reward_std": 0.3263393044471741, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89111328125, + "rewards/tag_count_reward/std": 0.22115187346935272, "step": 996 }, { @@ -28899,27 +28899,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.181640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 770.0546875, - "completions/mean_terminated_length": 754.9012451171875, - "completions/min_length": 269.0, - "completions/min_terminated_length": 269.0, + "completions/max_terminated_length": 1936.0, + "completions/mean_length": 1049.232421875, + "completions/mean_terminated_length": 827.5489501953125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, "epoch": 0.3403601604506273, - "grad_norm": 3.0594396591186523, - "kl": 2.16796875, - "learning_rate": 8.507899114658003e-07, - "loss": 0.1328, - "num_tokens": 592963934.0, - "reward": 1.1103515625, - "reward_std": 0.331748902797699, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, - "rewards/format_reward/mean": 0.033203125, - "rewards/format_reward/std": 0.17934183776378632, - "rewards/tag_count_reward/mean": 0.9462890625, - "rewards/tag_count_reward/std": 0.15654632449150085, + "grad_norm": 9.338483810424805, + "kl": 9.53125, + "learning_rate": 8.510822014445718e-07, + "loss": 0.6749, + "num_tokens": 660013606.0, + "reward": 1.009765625, + "reward_std": 0.39365410804748535, + "rewards/accuracy_reward/mean": 0.162109375, + "rewards/accuracy_reward/std": 0.3689115643501282, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.26027679443359375, "step": 997 }, { @@ -28928,27 +28928,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.216796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1949.0, - "completions/mean_length": 845.802734375, - "completions/mean_terminated_length": 809.5191040039062, - "completions/min_length": 245.0, - "completions/min_terminated_length": 245.0, + "completions/max_terminated_length": 1954.0, + "completions/mean_length": 1081.2734375, + "completions/mean_terminated_length": 813.6758422851562, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.3407015447640181, - "grad_norm": 2.0634729862213135, - "kl": 2.4453125, - "learning_rate": 8.503909505305048e-07, - "loss": 0.161, - "num_tokens": 593476969.0, - "reward": 1.091796875, - "reward_std": 0.3658602237701416, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.060546875, - "rewards/format_reward/std": 0.2387305200099945, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.16025325655937195, + "grad_norm": 10.63330078125, + "kl": 10.421875, + "learning_rate": 8.506834022019114e-07, + "loss": 0.7504, + "num_tokens": 660647202.0, + "reward": 0.92919921875, + "reward_std": 0.3531253933906555, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.82958984375, + "rewards/tag_count_reward/std": 0.2711474895477295, "step": 998 }, { @@ -28957,27 +28957,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 774.53515625, - "completions/mean_terminated_length": 751.7494506835938, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, + "completions/max_terminated_length": 1913.0, + "completions/mean_length": 1067.23828125, + "completions/mean_terminated_length": 792.625, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, "epoch": 0.3410429290774089, - "grad_norm": 0.7929698824882507, - "kl": 3.0, - "learning_rate": 8.499915632451975e-07, - "loss": 0.1656, - "num_tokens": 593945531.0, - "reward": 1.0576171875, - "reward_std": 0.3631839156150818, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.064453125, - "rewards/format_reward/std": 0.24579854309558868, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.18174204230308533, + "grad_norm": 25.351045608520508, + "kl": 11.46875, + "learning_rate": 8.502841758702982e-07, + "loss": 0.798, + "num_tokens": 661265628.0, + "reward": 0.89697265625, + "reward_std": 0.33137983083724976, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.82666015625, + "rewards/tag_count_reward/std": 0.2633094787597656, "step": 999 }, { @@ -28986,27 +28986,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.19140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 778.240234375, - "completions/mean_terminated_length": 768.2421264648438, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 1898.0, + "completions/mean_length": 1023.267578125, + "completions/mean_terminated_length": 780.6980590820312, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.34138431339079967, - "grad_norm": 3.1648752689361572, - "kl": 3.103515625, - "learning_rate": 8.495917501767352e-07, - "loss": 0.1261, - "num_tokens": 594422934.0, - "reward": 1.09228515625, - "reward_std": 0.38167354464530945, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.09765625, - "rewards/format_reward/std": 0.29713961482048035, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.18435926735401154, + "grad_norm": 5.361663818359375, + "kl": 9.765625, + "learning_rate": 8.498845230167912e-07, + "loss": 0.7402, + "num_tokens": 661868485.0, + "reward": 0.92041015625, + "reward_std": 0.30022215843200684, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.83447265625, + "rewards/tag_count_reward/std": 0.2669280767440796, "step": 1000 }, { @@ -29015,27 +29015,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.197265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 831.1484375, - "completions/mean_terminated_length": 799.4468994140625, - "completions/min_length": 228.0, - "completions/min_terminated_length": 228.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1037.65625, + "completions/mean_terminated_length": 789.3722534179688, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.3417256977041905, - "grad_norm": 1.4682717323303223, - "kl": 4.21875, - "learning_rate": 8.491915118925798e-07, - "loss": 0.2445, - "num_tokens": 594932706.0, - "reward": 1.10595703125, - "reward_std": 0.4507877826690674, - "rewards/accuracy_reward/mean": 0.12890625, - "rewards/accuracy_reward/std": 0.33542385697364807, - "rewards/format_reward/mean": 0.0859375, - "rewards/format_reward/std": 0.28054583072662354, - "rewards/tag_count_reward/mean": 0.89111328125, - "rewards/tag_count_reward/std": 0.21555037796497345, + "grad_norm": 5.13283634185791, + "kl": 9.1875, + "learning_rate": 8.494844442090552e-07, + "loss": 0.6795, + "num_tokens": 662483989.0, + "reward": 0.98681640625, + "reward_std": 0.3550630807876587, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.84619140625, + "rewards/tag_count_reward/std": 0.25345006585121155, "step": 1001 }, { @@ -29044,27 +29044,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 857.244140625, - "completions/mean_terminated_length": 823.76904296875, - "completions/min_length": 17.0, - "completions/min_terminated_length": 17.0, + "completions/max_terminated_length": 1668.0, + "completions/mean_length": 1030.228515625, + "completions/mean_terminated_length": 795.3582153320312, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, "epoch": 0.3420670820175813, - "grad_norm": 1.5017461776733398, - "kl": 4.0234375, - "learning_rate": 8.48790848960796e-07, - "loss": 0.2004, - "num_tokens": 595453535.0, - "reward": 1.103515625, - "reward_std": 0.4274582862854004, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.11328125, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.90625, - "rewards/tag_count_reward/std": 0.19906827807426453, + "grad_norm": 6.057972431182861, + "kl": 8.6484375, + "learning_rate": 8.490839400153594e-07, + "loss": 0.6274, + "num_tokens": 663093386.0, + "reward": 0.9345703125, + "reward_std": 0.3002741038799286, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8388671875, + "rewards/tag_count_reward/std": 0.2711919844150543, "step": 1002 }, { @@ -29073,27 +29073,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.193359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 814.6171875, - "completions/mean_terminated_length": 792.5486450195312, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 1033.94140625, + "completions/mean_terminated_length": 790.8619995117188, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, "epoch": 0.3424084663309721, - "grad_norm": 3.0235321521759033, - "kl": 3.5625, - "learning_rate": 8.483897619500517e-07, - "loss": 0.166, - "num_tokens": 595950603.0, - "reward": 1.064453125, - "reward_std": 0.42457592487335205, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.125, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.900390625, - "rewards/tag_count_reward/std": 0.20592933893203735, + "grad_norm": 4.031679153442383, + "kl": 7.8125, + "learning_rate": 8.486830110045779e-07, + "loss": 0.6197, + "num_tokens": 663702748.0, + "reward": 0.875, + "reward_std": 0.27105987071990967, + "rewards/accuracy_reward/mean": 0.03515625, + "rewards/accuracy_reward/std": 0.1843547374010086, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.26447224617004395, "step": 1003 }, { @@ -29102,27 +29102,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.1796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 866.642578125, - "completions/mean_terminated_length": 833.4317016601562, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1053.92578125, + "completions/mean_terminated_length": 836.1762084960938, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, "epoch": 0.3427498506443629, - "grad_norm": 1.5792306661605835, - "kl": 3.8671875, - "learning_rate": 8.479882514296165e-07, - "loss": 0.2534, - "num_tokens": 596471172.0, - "reward": 1.07763671875, - "reward_std": 0.439164936542511, - "rewards/accuracy_reward/mean": 0.052419353276491165, - "rewards/accuracy_reward/std": 0.22309617698192596, - "rewards/format_reward/mean": 0.130859375, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.89599609375, - "rewards/tag_count_reward/std": 0.21053139865398407, + "grad_norm": 9.7712984085083, + "kl": 7.7734375, + "learning_rate": 8.482816577461879e-07, + "loss": 0.665, + "num_tokens": 664319206.0, + "reward": 0.88427734375, + "reward_std": 0.32227811217308044, + "rewards/accuracy_reward/mean": 0.05443548411130905, + "rewards/accuracy_reward/std": 0.227104052901268, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.83154296875, + "rewards/tag_count_reward/std": 0.26783937215805054, "step": 1004 }, { @@ -29131,27 +29131,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 783.689453125, - "completions/mean_terminated_length": 758.5040283203125, - "completions/min_length": 66.0, - "completions/min_terminated_length": 66.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 952.447265625, + "completions/mean_terminated_length": 773.1749877929688, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.3430912349577537, - "grad_norm": 1.8494607210159302, - "kl": 2.94140625, - "learning_rate": 8.475863179693613e-07, - "loss": 0.1777, - "num_tokens": 596952453.0, - "reward": 1.283203125, - "reward_std": 0.5326836109161377, - "rewards/accuracy_reward/mean": 0.154296875, - "rewards/accuracy_reward/std": 0.36158639192581177, - "rewards/format_reward/mean": 0.2109375, - "rewards/format_reward/std": 0.4083731174468994, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.19119404256343842, + "grad_norm": 11.618585586547852, + "kl": 5.796875, + "learning_rate": 8.478798808102691e-07, + "loss": 0.5098, + "num_tokens": 664886891.0, + "reward": 1.0322265625, + "reward_std": 0.3323245048522949, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8642578125, + "rewards/tag_count_reward/std": 0.2572469115257263, "step": 1005 }, { @@ -29160,27 +29160,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.146484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 849.076171875, - "completions/mean_terminated_length": 812.8912963867188, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 1024.25, + "completions/mean_terminated_length": 848.5491333007812, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.3434326192711445, - "grad_norm": 1.4781209230422974, - "kl": 3.72265625, - "learning_rate": 8.471839621397569e-07, - "loss": 0.1878, - "num_tokens": 597458444.0, - "reward": 1.23876953125, - "reward_std": 0.554077684879303, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.255859375, - "rewards/format_reward/std": 0.43676990270614624, - "rewards/tag_count_reward/mean": 0.89892578125, - "rewards/tag_count_reward/std": 0.1976224035024643, + "grad_norm": 8.186664581298828, + "kl": 6.2734375, + "learning_rate": 8.474776807675032e-07, + "loss": 0.5432, + "num_tokens": 665482571.0, + "reward": 0.97998046875, + "reward_std": 0.3762948513031006, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.83935546875, + "rewards/tag_count_reward/std": 0.26944708824157715, "step": 1006 }, { @@ -29189,27 +29189,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.177734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 874.451171875, - "completions/mean_terminated_length": 841.4597778320312, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1065.953125, + "completions/mean_terminated_length": 853.6817626953125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, "epoch": 0.3437740035845353, - "grad_norm": 1.2865853309631348, - "kl": 3.65234375, - "learning_rate": 8.467811845118741e-07, - "loss": 0.1754, - "num_tokens": 597979651.0, - "reward": 1.22607421875, - "reward_std": 0.579481303691864, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.26953125, - "rewards/format_reward/std": 0.44415023922920227, - "rewards/tag_count_reward/mean": 0.89013671875, - "rewards/tag_count_reward/std": 0.2167528122663498, + "grad_norm": 4.170973300933838, + "kl": 7.5, + "learning_rate": 8.470750581891728e-07, + "loss": 0.5832, + "num_tokens": 666101827.0, + "reward": 0.88916015625, + "reward_std": 0.30262479186058044, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.83251953125, + "rewards/tag_count_reward/std": 0.2693619728088379, "step": 1007 }, { @@ -29218,27 +29218,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 853.5625, - "completions/mean_terminated_length": 807.5294189453125, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 1936.0, + "completions/mean_length": 1077.71484375, + "completions/mean_terminated_length": 853.8029174804688, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, "epoch": 0.3441153878979261, - "grad_norm": 3.341688394546509, - "kl": 4.61328125, - "learning_rate": 8.463779856573819e-07, - "loss": 0.2443, - "num_tokens": 598495827.0, - "reward": 1.30859375, - "reward_std": 0.6163730621337891, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.357421875, - "rewards/format_reward/std": 0.4797092080116272, - "rewards/tag_count_reward/mean": 0.8984375, - "rewards/tag_count_reward/std": 0.20616121590137482, + "grad_norm": 3.9423916339874268, + "kl": 8.9375, + "learning_rate": 8.466720136471607e-07, + "loss": 0.6858, + "num_tokens": 666732769.0, + "reward": 0.83984375, + "reward_std": 0.3145557641983032, + "rewards/accuracy_reward/mean": 0.025390625, + "rewards/accuracy_reward/std": 0.15746226906776428, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.814453125, + "rewards/tag_count_reward/std": 0.28281864523887634, "step": 1008 }, { @@ -29247,27 +29247,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.154296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 864.275390625, - "completions/mean_terminated_length": 823.6222534179688, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1053.103515625, + "completions/mean_terminated_length": 871.5866088867188, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.3444567722113169, - "grad_norm": 6.600764751434326, - "kl": 4.6953125, - "learning_rate": 8.459743661485472e-07, - "loss": 0.2085, - "num_tokens": 599024640.0, - "reward": 1.33154296875, - "reward_std": 0.6768674254417419, - "rewards/accuracy_reward/mean": 0.060483869165182114, - "rewards/accuracy_reward/std": 0.2386218160390854, - "rewards/format_reward/mean": 0.40625, - "rewards/format_reward/std": 0.49161264300346375, - "rewards/tag_count_reward/mean": 0.86669921875, - "rewards/tag_count_reward/std": 0.22268475592136383, + "grad_norm": 2.4373865127563477, + "kl": 7.953125, + "learning_rate": 8.462685477139489e-07, + "loss": 0.6046, + "num_tokens": 667358262.0, + "reward": 0.87744140625, + "reward_std": 0.31072795391082764, + "rewards/accuracy_reward/mean": 0.05040322616696358, + "rewards/accuracy_reward/std": 0.21899642050266266, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.82861328125, + "rewards/tag_count_reward/std": 0.2732292711734772, "step": 1009 }, { @@ -29276,27 +29276,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.1484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 736.970703125, - "completions/mean_terminated_length": 710.8546142578125, - "completions/min_length": 42.0, - "completions/min_terminated_length": 42.0, + "completions/max_terminated_length": 1855.0, + "completions/mean_length": 954.486328125, + "completions/mean_terminated_length": 763.8738403320312, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, "epoch": 0.3447981565247077, - "grad_norm": 18.189531326293945, - "kl": 5.4453125, - "learning_rate": 8.455703265582342e-07, - "loss": 0.2215, - "num_tokens": 599478081.0, - "reward": 1.39697265625, - "reward_std": 0.7271381616592407, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.447265625, - "rewards/format_reward/std": 0.4976975917816162, - "rewards/tag_count_reward/mean": 0.85595703125, - "rewards/tag_count_reward/std": 0.21869266033172607, + "grad_norm": 4.03355073928833, + "kl": 7.8125, + "learning_rate": 8.458646609626183e-07, + "loss": 0.5967, + "num_tokens": 667923071.0, + "reward": 0.982421875, + "reward_std": 0.33473697304725647, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.857421875, + "rewards/tag_count_reward/std": 0.25254786014556885, "step": 1010 }, { @@ -29305,27 +29305,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.177734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 779.84765625, - "completions/mean_terminated_length": 754.585693359375, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 1936.0, + "completions/mean_length": 1047.126953125, + "completions/mean_terminated_length": 830.7862548828125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.3451395408380985, - "grad_norm": 24.602689743041992, - "kl": 5.59375, - "learning_rate": 8.451658674599032e-07, - "loss": 0.2229, - "num_tokens": 599952787.0, - "reward": 1.3740234375, - "reward_std": 0.6721348166465759, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.474609375, - "rewards/format_reward/std": 0.4998432695865631, - "rewards/tag_count_reward/mean": 0.8486328125, - "rewards/tag_count_reward/std": 0.21453560888767242, + "grad_norm": 4.7540130615234375, + "kl": 9.234375, + "learning_rate": 8.45460353966847e-07, + "loss": 0.6879, + "num_tokens": 668534624.0, + "reward": 0.8916015625, + "reward_std": 0.3210606873035431, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8369140625, + "rewards/tag_count_reward/std": 0.2640654742717743, "step": 1011 }, { @@ -29334,27 +29334,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1956.0, - "completions/mean_length": 766.462890625, - "completions/mean_terminated_length": 727.78466796875, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 1944.0, + "completions/mean_length": 950.912109375, + "completions/mean_terminated_length": 794.185302734375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, "epoch": 0.3454809251514893, - "grad_norm": 25.765657424926758, - "kl": 5.8125, - "learning_rate": 8.447609894276102e-07, - "loss": 0.2457, - "num_tokens": 600418704.0, - "reward": 1.41796875, - "reward_std": 0.6370959281921387, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.50390625, - "rewards/format_reward/std": 0.5004737377166748, - "rewards/tag_count_reward/mean": 0.87109375, - "rewards/tag_count_reward/std": 0.2008649855852127, + "grad_norm": 7.545724391937256, + "kl": 7.21875, + "learning_rate": 8.450556273009104e-07, + "loss": 0.6112, + "num_tokens": 669094979.0, + "reward": 0.92578125, + "reward_std": 0.2982676923274994, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.861328125, + "rewards/tag_count_reward/std": 0.24790431559085846, "step": 1012 }, { @@ -29363,27 +29363,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.150390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 822.58984375, - "completions/mean_terminated_length": 767.5714111328125, - "completions/min_length": 196.0, - "completions/min_terminated_length": 196.0, + "completions/max_terminated_length": 1837.0, + "completions/mean_length": 1024.267578125, + "completions/mean_terminated_length": 843.05517578125, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, "epoch": 0.3458223094648801, - "grad_norm": 13.013208389282227, - "kl": 4.7265625, - "learning_rate": 8.443556930360048e-07, - "loss": 0.2222, - "num_tokens": 600922270.0, - "reward": 1.52880859375, - "reward_std": 0.6884911060333252, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.57421875, - "rewards/format_reward/std": 0.4949444830417633, - "rewards/tag_count_reward/mean": 0.86865234375, - "rewards/tag_count_reward/std": 0.2149244099855423, + "grad_norm": 4.24454927444458, + "kl": 8.3359375, + "learning_rate": 8.4465048153968e-07, + "loss": 0.647, + "num_tokens": 669701804.0, + "reward": 0.9658203125, + "reward_std": 0.3412587642669678, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8466796875, + "rewards/tag_count_reward/std": 0.25781089067459106, "step": 1013 }, { @@ -29392,27 +29392,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 790.443359375, - "completions/mean_terminated_length": 752.4888916015625, - "completions/min_length": 72.0, - "completions/min_terminated_length": 72.0, + "completions/max_terminated_length": 1936.0, + "completions/mean_length": 945.859375, + "completions/mean_terminated_length": 794.0089111328125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, "epoch": 0.3461636937782709, - "grad_norm": 6.748332977294922, - "kl": 4.6328125, - "learning_rate": 8.439499788603318e-07, - "loss": 0.2348, - "num_tokens": 601397009.0, - "reward": 1.5400390625, - "reward_std": 0.7158875465393066, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.58203125, - "rewards/format_reward/std": 0.4937073290348053, - "rewards/tag_count_reward/mean": 0.8701171875, - "rewards/tag_count_reward/std": 0.21382176876068115, + "grad_norm": 2.692584276199341, + "kl": 7.4921875, + "learning_rate": 8.442449172586224e-07, + "loss": 0.5694, + "num_tokens": 670256116.0, + "reward": 1.0166015625, + "reward_std": 0.34034305810928345, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8720703125, + "rewards/tag_count_reward/std": 0.23614485561847687, "step": 1014 }, { @@ -29421,27 +29421,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.111328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 845.712890625, - "completions/mean_terminated_length": 794.291259765625, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 973.28515625, + "completions/mean_terminated_length": 838.6505737304688, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, "epoch": 0.3465050780916617, - "grad_norm": 4.612382888793945, - "kl": 3.69140625, - "learning_rate": 8.435438474764281e-07, - "loss": 0.2272, - "num_tokens": 601906414.0, - "reward": 1.58349609375, - "reward_std": 0.5909501314163208, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.66796875, - "rewards/format_reward/std": 0.47140273451805115, - "rewards/tag_count_reward/mean": 0.88232421875, - "rewards/tag_count_reward/std": 0.20912423729896545, + "grad_norm": 6.587097644805908, + "kl": 6.359375, + "learning_rate": 8.438389350337988e-07, + "loss": 0.5112, + "num_tokens": 670830838.0, + "reward": 0.87890625, + "reward_std": 0.26306241750717163, + "rewards/accuracy_reward/mean": 0.029296875, + "rewards/accuracy_reward/std": 0.16880230605602264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.849609375, + "rewards/tag_count_reward/std": 0.26696789264678955, "step": 1015 }, { @@ -29450,27 +29450,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1952.0, - "completions/mean_length": 837.359375, - "completions/mean_terminated_length": 803.3252563476562, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 994.8515625, + "completions/mean_terminated_length": 878.3427734375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.3468464624050525, - "grad_norm": 5.700664520263672, - "kl": 3.234375, - "learning_rate": 8.431372994607225e-07, - "loss": 0.1731, - "num_tokens": 602408326.0, - "reward": 1.67431640625, - "reward_std": 0.6381564140319824, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.689453125, - "rewards/format_reward/std": 0.46317005157470703, - "rewards/tag_count_reward/mean": 0.90283203125, - "rewards/tag_count_reward/std": 0.1958642452955246, + "grad_norm": 3.6660964488983154, + "kl": 6.875, + "learning_rate": 8.434325354418639e-07, + "loss": 0.5185, + "num_tokens": 671413386.0, + "reward": 0.958984375, + "reward_std": 0.3116724193096161, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.2639225125312805, "step": 1016 }, { @@ -29479,27 +29479,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.10546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1893.0, - "completions/mean_length": 794.33984375, - "completions/mean_terminated_length": 774.4404907226562, - "completions/min_length": 25.0, - "completions/min_terminated_length": 25.0, + "completions/max_terminated_length": 1940.0, + "completions/mean_length": 942.5546875, + "completions/mean_terminated_length": 812.2183837890625, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.34718784671844327, - "grad_norm": 5.525385856628418, - "kl": 2.607421875, - "learning_rate": 8.427303353902359e-07, - "loss": 0.1343, - "num_tokens": 602895668.0, - "reward": 1.744140625, - "reward_std": 0.5869893431663513, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.7421875, - "rewards/format_reward/std": 0.43785804510116577, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.17446976900100708, + "grad_norm": 6.159041404724121, + "kl": 7.140625, + "learning_rate": 8.430257190600653e-07, + "loss": 0.5789, + "num_tokens": 671976614.0, + "reward": 0.94970703125, + "reward_std": 0.31130433082580566, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.84814453125, + "rewards/tag_count_reward/std": 0.26590588688850403, "step": 1017 }, { @@ -29508,27 +29508,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.115234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1977.0, - "completions/mean_length": 832.45703125, - "completions/mean_terminated_length": 810.7077026367188, - "completions/min_length": 41.0, - "completions/min_terminated_length": 41.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 971.140625, + "completions/mean_terminated_length": 830.8873901367188, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, "epoch": 0.3475292310318341, - "grad_norm": 2.698788642883301, - "kl": 3.30859375, - "learning_rate": 8.423229558425796e-07, - "loss": 0.1227, - "num_tokens": 603405390.0, - "reward": 1.68603515625, - "reward_std": 0.6165211200714111, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.712890625, - "rewards/format_reward/std": 0.45285552740097046, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.19274641573429108, + "grad_norm": 3.7741825580596924, + "kl": 8.0390625, + "learning_rate": 8.426184864662426e-07, + "loss": 0.5909, + "num_tokens": 672557342.0, + "reward": 0.93212890625, + "reward_std": 0.29936298727989197, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86181640625, + "rewards/tag_count_reward/std": 0.25087326765060425, "step": 1018 }, { @@ -29537,27 +29537,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 892.26953125, - "completions/mean_terminated_length": 840.3795776367188, - "completions/min_length": 78.0, - "completions/min_terminated_length": 78.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 999.513671875, + "completions/mean_terminated_length": 870.752197265625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, "epoch": 0.3478706153452249, - "grad_norm": 1.207611322402954, - "kl": 5.46875, - "learning_rate": 8.419151613959539e-07, - "loss": 0.3078, - "num_tokens": 603933416.0, - "reward": 1.6123046875, - "reward_std": 0.6389665603637695, + "grad_norm": 4.3473405838012695, + "kl": 8.03125, + "learning_rate": 8.422108382388268e-07, + "loss": 0.5948, + "num_tokens": 673140277.0, + "reward": 0.87744140625, + "reward_std": 0.2634720504283905, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15143637359142303, - "rewards/format_reward/mean": 0.69921875, - "rewards/format_reward/std": 0.45904624462127686, - "rewards/tag_count_reward/mean": 0.8896484375, - "rewards/tag_count_reward/std": 0.21734988689422607, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.85400390625, + "rewards/tag_count_reward/std": 0.2527705132961273, "step": 1019 }, { @@ -29566,27 +29566,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 810.12109375, - "completions/mean_terminated_length": 775.3212280273438, - "completions/min_length": 16.0, - "completions/min_terminated_length": 16.0, - "epoch": 0.3482119996586157, - "grad_norm": 1.1834700107574463, - "kl": 6.171875, - "learning_rate": 8.415069526291486e-07, - "loss": 0.3246, - "num_tokens": 604423158.0, - "reward": 1.67724609375, - "reward_std": 0.7229315042495728, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, - "rewards/format_reward/mean": 0.673828125, - "rewards/format_reward/std": 0.4692695140838623, - "rewards/tag_count_reward/mean": 0.88037109375, - "rewards/tag_count_reward/std": 0.21551933884620667, + "completions/max_terminated_length": 1941.0, + "completions/mean_length": 902.4375, + "completions/mean_terminated_length": 775.7050170898438, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.3482119996586157, + "grad_norm": 4.2296600341796875, + "kl": 7.4921875, + "learning_rate": 8.418027749568388e-07, + "loss": 0.5768, + "num_tokens": 673677285.0, + "reward": 1.02734375, + "reward_std": 0.34143146872520447, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.876953125, + "rewards/tag_count_reward/std": 0.24229030311107635, "step": 1020 }, { @@ -29595,27 +29595,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 875.41015625, - "completions/mean_terminated_length": 847.26806640625, - "completions/min_length": 34.0, - "completions/min_terminated_length": 34.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 992.162109375, + "completions/mean_terminated_length": 882.9375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, "epoch": 0.34855338397200647, - "grad_norm": 1.2013593912124634, - "kl": 5.6328125, - "learning_rate": 8.410983301215415e-07, - "loss": 0.3053, - "num_tokens": 604940456.0, - "reward": 1.69384765625, - "reward_std": 0.6755542159080505, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.71484375, - "rewards/format_reward/std": 0.45193037390708923, - "rewards/tag_count_reward/mean": 0.88720703125, - "rewards/tag_count_reward/std": 0.21295472979545593, + "grad_norm": 3.2296805381774902, + "kl": 8.6953125, + "learning_rate": 8.413942971998897e-07, + "loss": 0.6213, + "num_tokens": 674254360.0, + "reward": 0.95849609375, + "reward_std": 0.3326249122619629, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.85107421875, + "rewards/tag_count_reward/std": 0.2634146809577942, "step": 1021 }, { @@ -29624,27 +29624,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.119140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 855.462890625, - "completions/mean_terminated_length": 834.1251831054688, - "completions/min_length": 172.0, - "completions/min_terminated_length": 172.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1026.865234375, + "completions/mean_terminated_length": 888.7516479492188, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.3488947682853973, - "grad_norm": 3.499455213546753, - "kl": 6.1875, - "learning_rate": 8.406892944530973e-07, - "loss": 0.277, - "num_tokens": 605451221.0, - "reward": 1.6103515625, - "reward_std": 0.6938778758049011, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.673828125, - "rewards/format_reward/std": 0.4692695140838623, - "rewards/tag_count_reward/mean": 0.8720703125, - "rewards/tag_count_reward/std": 0.21271060407161713, + "grad_norm": 4.401437759399414, + "kl": 8.5390625, + "learning_rate": 8.409854055481784e-07, + "loss": 0.5988, + "num_tokens": 674852883.0, + "reward": 0.916015625, + "reward_std": 0.31277602910995483, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.841796875, + "rewards/tag_count_reward/std": 0.2683957815170288, "step": 1022 }, { @@ -29653,27 +29653,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 828.400390625, - "completions/mean_terminated_length": 801.6227416992188, - "completions/min_length": 125.0, - "completions/min_terminated_length": 125.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 928.533203125, + "completions/mean_terminated_length": 838.786865234375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, "epoch": 0.3492361525987881, - "grad_norm": 3.2929532527923584, - "kl": 6.3671875, - "learning_rate": 8.402798462043673e-07, - "loss": 0.2915, - "num_tokens": 605950674.0, - "reward": 1.61669921875, - "reward_std": 0.710044264793396, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.650390625, - "rewards/format_reward/std": 0.47731292247772217, - "rewards/tag_count_reward/mean": 0.87255859375, - "rewards/tag_count_reward/std": 0.2211734801530838, + "grad_norm": 2.7053239345550537, + "kl": 7.140625, + "learning_rate": 8.405761005824927e-07, + "loss": 0.5257, + "num_tokens": 675403604.0, + "reward": 0.97216796875, + "reward_std": 0.33417749404907227, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.85888671875, + "rewards/tag_count_reward/std": 0.25457483530044556, "step": 1023 }, { @@ -29682,27 +29682,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 858.580078125, - "completions/mean_terminated_length": 825.1425170898438, - "completions/min_length": 9.0, - "completions/min_terminated_length": 9.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 936.703125, + "completions/mean_terminated_length": 857.6568603515625, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, "epoch": 0.3495775369121789, - "grad_norm": 3.8285183906555176, - "kl": 6.796875, - "learning_rate": 8.398699859564887e-07, - "loss": 0.3447, - "num_tokens": 606455771.0, - "reward": 1.66162109375, - "reward_std": 0.7346779108047485, - "rewards/accuracy_reward/mean": 0.1270161271095276, - "rewards/accuracy_reward/std": 0.33332720398902893, - "rewards/format_reward/mean": 0.6640625, - "rewards/format_reward/std": 0.4727790653705597, - "rewards/tag_count_reward/mean": 0.87451171875, - "rewards/tag_count_reward/std": 0.2189633846282959, + "grad_norm": 3.5899410247802734, + "kl": 7.125, + "learning_rate": 8.401663828842066e-07, + "loss": 0.4985, + "num_tokens": 675948700.0, + "reward": 1.01611328125, + "reward_std": 0.3711642026901245, + "rewards/accuracy_reward/mean": 0.14919355511665344, + "rewards/accuracy_reward/std": 0.3566388487815857, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87158203125, + "rewards/tag_count_reward/std": 0.24973182380199432, "step": 1024 }, { @@ -29711,27 +29711,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1959.0, - "completions/mean_length": 893.4609375, - "completions/mean_terminated_length": 839.157470703125, - "completions/min_length": 50.0, - "completions/min_terminated_length": 50.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 943.154296875, + "completions/mean_terminated_length": 871.9480590820312, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, "epoch": 0.34991892122556967, - "grad_norm": 1.2652173042297363, - "kl": 5.65625, - "learning_rate": 8.39459714291183e-07, - "loss": 0.283, - "num_tokens": 606993927.0, - "reward": 1.5908203125, - "reward_std": 0.7199634909629822, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.640625, - "rewards/format_reward/std": 0.48028653860092163, - "rewards/tag_count_reward/mean": 0.8681640625, - "rewards/tag_count_reward/std": 0.22546352446079254, + "grad_norm": 2.5891060829162598, + "kl": 6.71875, + "learning_rate": 8.39756253035281e-07, + "loss": 0.4837, + "num_tokens": 676512299.0, + "reward": 0.998046875, + "reward_std": 0.3223692774772644, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.869140625, + "rewards/tag_count_reward/std": 0.24821248650550842, "step": 1025 }, { @@ -29742,25 +29742,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 881.57421875, - "completions/mean_terminated_length": 819.1727905273438, - "completions/min_length": 76.0, - "completions/min_terminated_length": 76.0, + "completions/max_terminated_length": 1939.0, + "completions/mean_length": 905.380859375, + "completions/mean_terminated_length": 844.2530517578125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, "epoch": 0.3502603055389605, - "grad_norm": 2.198275327682495, - "kl": 5.546875, - "learning_rate": 8.390490317907557e-07, - "loss": 0.3021, - "num_tokens": 607520765.0, - "reward": 1.5654296875, - "reward_std": 0.6948752403259277, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.62890625, - "rewards/format_reward/std": 0.4835699498653412, + "grad_norm": 8.612765312194824, + "kl": 6.359375, + "learning_rate": 8.393457116182619e-07, + "loss": 0.4515, + "num_tokens": 677051326.0, + "reward": 0.9443359375, + "reward_std": 0.28609099984169006, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8779296875, - "rewards/tag_count_reward/std": 0.21271060407161713, + "rewards/tag_count_reward/std": 0.23923231661319733, "step": 1026 }, { @@ -29769,27 +29769,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 859.9921875, - "completions/mean_terminated_length": 819.1919555664062, - "completions/min_length": 214.0, - "completions/min_terminated_length": 214.0, + "completions/max_terminated_length": 1908.0, + "completions/mean_length": 877.6328125, + "completions/mean_terminated_length": 849.5440673828125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, "epoch": 0.3506016898523513, - "grad_norm": 3.3149538040161133, - "kl": 4.89453125, - "learning_rate": 8.386379390380956e-07, - "loss": 0.2831, - "num_tokens": 608032809.0, - "reward": 1.68896484375, - "reward_std": 0.6970325708389282, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, - "rewards/format_reward/mean": 0.6875, - "rewards/format_reward/std": 0.4639657139778137, - "rewards/tag_count_reward/mean": 0.88427734375, - "rewards/tag_count_reward/std": 0.21424756944179535, + "grad_norm": 4.508106708526611, + "kl": 5.7890625, + "learning_rate": 8.389347592162799e-07, + "loss": 0.3741, + "num_tokens": 677572402.0, + "reward": 1.0361328125, + "reward_std": 0.33224111795425415, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8759765625, + "rewards/tag_count_reward/std": 0.2438058853149414, "step": 1027 }, { @@ -29800,25 +29800,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1970.0, - "completions/mean_length": 913.5859375, - "completions/mean_terminated_length": 860.22900390625, - "completions/min_length": 200.0, - "completions/min_terminated_length": 200.0, + "completions/max_terminated_length": 1770.0, + "completions/mean_length": 882.857421875, + "completions/mean_terminated_length": 828.05517578125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.3509430741657421, - "grad_norm": 3.4048895835876465, - "kl": 4.83984375, - "learning_rate": 8.382264366166736e-07, - "loss": 0.2893, - "num_tokens": 608572741.0, - "reward": 1.69482421875, - "reward_std": 0.7132976055145264, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, - "rewards/format_reward/mean": 0.6796875, - "rewards/format_reward/std": 0.4670529365539551, - "rewards/tag_count_reward/mean": 0.88232421875, - "rewards/tag_count_reward/std": 0.21772050857543945, + "grad_norm": 4.022824287414551, + "kl": 5.8515625, + "learning_rate": 8.385233964130493e-07, + "loss": 0.3832, + "num_tokens": 678096601.0, + "reward": 1.03564453125, + "reward_std": 0.34572306275367737, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87158203125, + "rewards/tag_count_reward/std": 0.24478521943092346, "step": 1028 }, { @@ -29827,27 +29827,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 914.361328125, - "completions/mean_terminated_length": 856.1663818359375, - "completions/min_length": 94.0, - "completions/min_terminated_length": 94.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 897.05859375, + "completions/mean_terminated_length": 867.0741577148438, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, "epoch": 0.35128445847913287, - "grad_norm": 1.075799584388733, - "kl": 5.4921875, - "learning_rate": 8.378145251105423e-07, - "loss": 0.2675, - "num_tokens": 609114430.0, - "reward": 1.484375, - "reward_std": 0.7132372856140137, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.59375, - "rewards/format_reward/std": 0.49161264300346375, - "rewards/tag_count_reward/mean": 0.84765625, - "rewards/tag_count_reward/std": 0.23716437816619873, + "grad_norm": 4.801035404205322, + "kl": 4.69140625, + "learning_rate": 8.381116237928677e-07, + "loss": 0.3316, + "num_tokens": 678629431.0, + "reward": 0.970703125, + "reward_std": 0.2831045091152191, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.890625, + "rewards/tag_count_reward/std": 0.22986425459384918, "step": 1029 }, { @@ -29856,27 +29856,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1959.0, - "completions/mean_length": 889.15625, - "completions/mean_terminated_length": 811.9000244140625, - "completions/min_length": 2.0, - "completions/min_terminated_length": 2.0, + "completions/max_terminated_length": 1943.0, + "completions/mean_length": 868.1484375, + "completions/mean_terminated_length": 844.6454467773438, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.3516258427925237, - "grad_norm": 1.336026668548584, - "kl": 6.1953125, - "learning_rate": 8.374022051043344e-07, - "loss": 0.3564, - "num_tokens": 609654062.0, - "reward": 1.5947265625, - "reward_std": 0.7138975262641907, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.638671875, - "rewards/format_reward/std": 0.48085519671440125, - "rewards/tag_count_reward/mean": 0.8623046875, - "rewards/tag_count_reward/std": 0.2358209192752838, + "grad_norm": 5.5516228675842285, + "kl": 4.25, + "learning_rate": 8.376994419406141e-07, + "loss": 0.331, + "num_tokens": 679158307.0, + "reward": 1.02783203125, + "reward_std": 0.32698333263397217, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90283203125, + "rewards/tag_count_reward/std": 0.2137787640094757, "step": 1030 }, { @@ -29885,27 +29885,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1946.0, - "completions/mean_length": 787.4921875, - "completions/mean_terminated_length": 762.3825073242188, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 847.8671875, + "completions/mean_terminated_length": 816.6011962890625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, "epoch": 0.3519672271059145, - "grad_norm": 2.8935294151306152, - "kl": 4.9140625, - "learning_rate": 8.36989477183263e-07, - "loss": 0.2718, - "num_tokens": 610135434.0, - "reward": 1.62109375, - "reward_std": 0.6810424327850342, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.646484375, - "rewards/format_reward/std": 0.47852855920791626, - "rewards/tag_count_reward/mean": 0.89453125, - "rewards/tag_count_reward/std": 0.20177629590034485, + "grad_norm": 3.7457098960876465, + "kl": 5.0859375, + "learning_rate": 8.372868514417496e-07, + "loss": 0.3782, + "num_tokens": 679670591.0, + "reward": 1.02099609375, + "reward_std": 0.30011674761772156, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90380859375, + "rewards/tag_count_reward/std": 0.2198561728000641, "step": 1031 }, { @@ -29914,27 +29914,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1990.0, - "completions/mean_length": 808.677734375, - "completions/mean_terminated_length": 768.6995849609375, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 855.859375, + "completions/mean_terminated_length": 829.6846313476562, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.3523086114193053, - "grad_norm": 1.5922437906265259, - "kl": 4.234375, - "learning_rate": 8.365763419331199e-07, - "loss": 0.1738, - "num_tokens": 610625189.0, - "reward": 1.63134765625, - "reward_std": 0.6297500729560852, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.6953125, - "rewards/format_reward/std": 0.4607250988483429, - "rewards/tag_count_reward/mean": 0.89111328125, - "rewards/tag_count_reward/std": 0.2038862407207489, + "grad_norm": 3.989301919937134, + "kl": 4.046875, + "learning_rate": 8.368738528823152e-07, + "loss": 0.2937, + "num_tokens": 680184503.0, + "reward": 0.9638671875, + "reward_std": 0.2415887415409088, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.2075263112783432, "step": 1032 }, { @@ -29943,27 +29943,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1990.0, - "completions/mean_length": 843.97265625, - "completions/mean_terminated_length": 822.4293823242188, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 898.009765625, + "completions/mean_terminated_length": 856.1072998046875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.35264999573269606, - "grad_norm": 1.8940683603286743, - "kl": 4.8515625, - "learning_rate": 8.361627999402748e-07, - "loss": 0.1957, - "num_tokens": 611133239.0, - "reward": 1.61669921875, - "reward_std": 0.6727503538131714, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.669921875, - "rewards/format_reward/std": 0.47070086002349854, - "rewards/tag_count_reward/mean": 0.87841796875, - "rewards/tag_count_reward/std": 0.21612590551376343, + "grad_norm": 4.103821754455566, + "kl": 6.1328125, + "learning_rate": 8.364604468489316e-07, + "loss": 0.3888, + "num_tokens": 680720220.0, + "reward": 0.96923828125, + "reward_std": 0.29523128271102905, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87939453125, + "rewards/tag_count_reward/std": 0.23404096066951752, "step": 1033 }, { @@ -29972,27 +29972,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1943.0, - "completions/mean_length": 789.708984375, - "completions/mean_terminated_length": 762.0818481445312, - "completions/min_length": 35.0, - "completions/min_terminated_length": 35.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 852.36328125, + "completions/mean_terminated_length": 808.797607421875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, "epoch": 0.3529913800460869, - "grad_norm": 1.276130199432373, - "kl": 5.2109375, - "learning_rate": 8.357488517916752e-07, - "loss": 0.2572, - "num_tokens": 611609122.0, - "reward": 1.68603515625, - "reward_std": 0.6626811623573303, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.697265625, - "rewards/format_reward/std": 0.45989060401916504, - "rewards/tag_count_reward/mean": 0.89306640625, - "rewards/tag_count_reward/std": 0.21138295531272888, + "grad_norm": 4.786602020263672, + "kl": 5.65625, + "learning_rate": 8.360466339287985e-07, + "loss": 0.39, + "num_tokens": 681228182.0, + "reward": 1.033203125, + "reward_std": 0.31543678045272827, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.22379815578460693, "step": 1034 }, { @@ -30001,27 +30001,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 888.078125, - "completions/mean_terminated_length": 840.9268188476562, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 905.75390625, + "completions/mean_terminated_length": 880.6746215820312, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, "epoch": 0.3533327643594777, - "grad_norm": 1.1660982370376587, - "kl": 5.3359375, - "learning_rate": 8.353344980748446e-07, - "loss": 0.2584, - "num_tokens": 612142346.0, - "reward": 1.60986328125, - "reward_std": 0.6833149194717407, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.658203125, - "rewards/format_reward/std": 0.4747757613658905, - "rewards/tag_count_reward/mean": 0.88330078125, - "rewards/tag_count_reward/std": 0.210835263133049, + "grad_norm": 44.86424255371094, + "kl": 6.5859375, + "learning_rate": 8.356324147096931e-07, + "loss": 0.4118, + "num_tokens": 681770456.0, + "reward": 0.9501953125, + "reward_std": 0.26379573345184326, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8916015625, + "rewards/tag_count_reward/std": 0.22056184709072113, "step": 1035 }, { @@ -30030,27 +30030,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 841.369140625, - "completions/mean_terminated_length": 812.4100341796875, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, + "completions/max_terminated_length": 1800.0, + "completions/mean_length": 899.765625, + "completions/mean_terminated_length": 810.3241577148438, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, "epoch": 0.3536741486728685, - "grad_norm": 1.5263030529022217, - "kl": 3.9296875, - "learning_rate": 8.349197393778825e-07, - "loss": 0.1906, - "num_tokens": 612654679.0, - "reward": 1.68212890625, - "reward_std": 0.6412637233734131, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.705078125, - "rewards/format_reward/std": 0.4564536213874817, - "rewards/tag_count_reward/mean": 0.90087890625, - "rewards/tag_count_reward/std": 0.20467674732208252, + "grad_norm": 11.403457641601562, + "kl": 8.484375, + "learning_rate": 8.352177897799701e-07, + "loss": 0.5758, + "num_tokens": 682312688.0, + "reward": 0.9619140625, + "reward_std": 0.31183531880378723, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8876953125, + "rewards/tag_count_reward/std": 0.22412028908729553, "step": 1036 }, { @@ -30059,27 +30059,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 927.36328125, - "completions/mean_terminated_length": 862.5330200195312, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 881.23828125, + "completions/mean_terminated_length": 843.6007690429688, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.35401553298625926, - "grad_norm": 1.7529670000076294, - "kl": 4.578125, - "learning_rate": 8.345045762894628e-07, - "loss": 0.2572, - "num_tokens": 613204161.0, - "reward": 1.623046875, - "reward_std": 0.6420872211456299, - "rewards/accuracy_reward/mean": 0.025390625, - "rewards/accuracy_reward/std": 0.15746226906776428, - "rewards/format_reward/mean": 0.712890625, - "rewards/format_reward/std": 0.45285552740097046, - "rewards/tag_count_reward/mean": 0.884765625, - "rewards/tag_count_reward/std": 0.22041666507720947, + "grad_norm": 3.574113368988037, + "kl": 5.59375, + "learning_rate": 8.348027597285601e-07, + "loss": 0.3732, + "num_tokens": 682838554.0, + "reward": 0.931640625, + "reward_std": 0.24733318388462067, + "rewards/accuracy_reward/mean": 0.02734375, + "rewards/accuracy_reward/std": 0.16324250400066376, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.904296875, + "rewards/tag_count_reward/std": 0.2118576318025589, "step": 1037 }, { @@ -30088,27 +30088,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 954.96875, - "completions/mean_terminated_length": 896.4937744140625, - "completions/min_length": 210.0, - "completions/min_terminated_length": 210.0, + "completions/max_terminated_length": 1973.0, + "completions/mean_length": 991.48828125, + "completions/mean_terminated_length": 937.2526245117188, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, "epoch": 0.3543569172996501, - "grad_norm": 1.533047080039978, - "kl": 4.6484375, - "learning_rate": 8.340890093988336e-07, - "loss": 0.2689, - "num_tokens": 613764225.0, - "reward": 1.6484375, - "reward_std": 0.6554567813873291, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.701171875, - "rewards/format_reward/std": 0.45819199085235596, - "rewards/tag_count_reward/mean": 0.888671875, - "rewards/tag_count_reward/std": 0.2117132544517517, + "grad_norm": 3.032055377960205, + "kl": 6.01953125, + "learning_rate": 8.343873251449699e-07, + "loss": 0.4065, + "num_tokens": 683417316.0, + "reward": 0.93896484375, + "reward_std": 0.2611902356147766, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.88037109375, + "rewards/tag_count_reward/std": 0.2345426082611084, "step": 1038 }, { @@ -30117,27 +30117,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 851.8046875, - "completions/mean_terminated_length": 823.0960693359375, - "completions/min_length": 18.0, - "completions/min_terminated_length": 18.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 871.505859375, + "completions/mean_terminated_length": 852.8314208984375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, "epoch": 0.3546983016130409, - "grad_norm": 2.632937431335449, - "kl": 3.77734375, - "learning_rate": 8.336730392958163e-07, - "loss": 0.2126, - "num_tokens": 614272989.0, - "reward": 1.77001953125, - "reward_std": 0.5945524573326111, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.76171875, - "rewards/format_reward/std": 0.42644867300987244, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.17819732427597046, + "grad_norm": 7.762738227844238, + "kl": 4.37890625, + "learning_rate": 8.3397148661928e-07, + "loss": 0.2951, + "num_tokens": 683936167.0, + "reward": 1.0302734375, + "reward_std": 0.2508530914783478, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.19272471964359283, "step": 1039 }, { @@ -30146,27 +30146,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 842.3046875, - "completions/mean_terminated_length": 790.7373046875, - "completions/min_length": 8.0, - "completions/min_terminated_length": 8.0, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 857.955078125, + "completions/mean_terminated_length": 831.8263549804688, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, "epoch": 0.3550396859264317, - "grad_norm": 2.310908794403076, - "kl": 5.375, - "learning_rate": 8.332566665708041e-07, - "loss": 0.3472, - "num_tokens": 614776889.0, - "reward": 1.7568359375, - "reward_std": 0.694198489189148, - "rewards/accuracy_reward/mean": 0.119140625, - "rewards/accuracy_reward/std": 0.32427072525024414, - "rewards/format_reward/mean": 0.7421875, - "rewards/format_reward/std": 0.43785804510116577, - "rewards/tag_count_reward/mean": 0.8955078125, - "rewards/tag_count_reward/std": 0.21057961881160736, + "grad_norm": 1.9511668682098389, + "kl": 3.5, + "learning_rate": 8.33555244742145e-07, + "loss": 0.2497, + "num_tokens": 684448080.0, + "reward": 1.06103515625, + "reward_std": 0.2851088047027588, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.17669491469860077, "step": 1040 }, { @@ -30175,27 +30175,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 787.267578125, - "completions/mean_terminated_length": 751.8252563476562, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 834.376953125, + "completions/mean_terminated_length": 795.227783203125, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, "epoch": 0.35538107023982246, - "grad_norm": 2.5846283435821533, - "kl": 5.9921875, - "learning_rate": 8.328398918147622e-07, - "loss": 0.3324, - "num_tokens": 615262434.0, - "reward": 1.67724609375, - "reward_std": 0.6175016760826111, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.7265625, - "rewards/format_reward/std": 0.4461594223976135, - "rewards/tag_count_reward/mean": 0.88623046875, - "rewards/tag_count_reward/std": 0.22145411372184753, + "grad_norm": 4.643179893493652, + "kl": 3.9765625, + "learning_rate": 8.331386001047927e-07, + "loss": 0.2798, + "num_tokens": 684957745.0, + "reward": 0.99267578125, + "reward_std": 0.22653181850910187, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92041015625, + "rewards/tag_count_reward/std": 0.19190676510334015, "step": 1041 }, { @@ -30204,27 +30204,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1964.0, - "completions/mean_length": 830.2890625, - "completions/mean_terminated_length": 773.0142822265625, - "completions/min_length": 87.0, - "completions/min_terminated_length": 87.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 863.89453125, + "completions/mean_terminated_length": 820.7490234375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, "epoch": 0.3557224545532133, - "grad_norm": 5.108445644378662, - "kl": 7.3046875, - "learning_rate": 8.32422715619226e-07, - "loss": 0.4022, - "num_tokens": 615761830.0, - "reward": 1.66845703125, - "reward_std": 0.6759384274482727, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.69921875, - "rewards/format_reward/std": 0.45904624462127686, - "rewards/tag_count_reward/mean": 0.87744140625, - "rewards/tag_count_reward/std": 0.22878453135490417, + "grad_norm": 4.2955851554870605, + "kl": 3.6875, + "learning_rate": 8.327215532990221e-07, + "loss": 0.2524, + "num_tokens": 685474347.0, + "reward": 1.0654296875, + "reward_std": 0.24000626802444458, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.19806493818759918, "step": 1042 }, { @@ -30233,27 +30233,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1935.0, - "completions/mean_length": 924.916015625, - "completions/mean_terminated_length": 876.8818969726562, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 958.12109375, + "completions/mean_terminated_length": 929.7274780273438, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.3560638388666041, - "grad_norm": 2.7219574451446533, - "kl": 5.99609375, - "learning_rate": 8.320051385763005e-07, - "loss": 0.32, - "num_tokens": 616322107.0, - "reward": 1.66943359375, - "reward_std": 0.6089550256729126, - "rewards/accuracy_reward/mean": 0.058467742055654526, - "rewards/accuracy_reward/std": 0.23486268520355225, - "rewards/format_reward/mean": 0.7109375, - "rewards/format_reward/std": 0.45377036929130554, - "rewards/tag_count_reward/mean": 0.90185546875, - "rewards/tag_count_reward/std": 0.19348861277103424, + "grad_norm": 5.902117729187012, + "kl": 3.48046875, + "learning_rate": 8.323041049172048e-07, + "loss": 0.246, + "num_tokens": 686051625.0, + "reward": 0.99560546875, + "reward_std": 0.2779981195926666, + "rewards/accuracy_reward/mean": 0.08266129344701767, + "rewards/accuracy_reward/std": 0.2756475806236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91552734375, + "rewards/tag_count_reward/std": 0.19800402224063873, "step": 1043 }, { @@ -30262,27 +30262,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 845.603515625, - "completions/mean_terminated_length": 794.17724609375, - "completions/min_length": 106.0, - "completions/min_terminated_length": 106.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 877.81640625, + "completions/mean_terminated_length": 825.2775268554688, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, "epoch": 0.3564052231799949, - "grad_norm": 2.1691174507141113, - "kl": 6.421875, - "learning_rate": 8.315871612786604e-07, - "loss": 0.345, - "num_tokens": 616840768.0, - "reward": 1.6875, - "reward_std": 0.692090630531311, - "rewards/accuracy_reward/mean": 0.11088709533214569, - "rewards/accuracy_reward/std": 0.3143092691898346, - "rewards/format_reward/mean": 0.697265625, - "rewards/format_reward/std": 0.45989060401916504, - "rewards/tag_count_reward/mean": 0.8828125, - "rewards/tag_count_reward/std": 0.22434400022029877, + "grad_norm": 3.5963375568389893, + "kl": 4.328125, + "learning_rate": 8.318862555522816e-07, + "loss": 0.3361, + "num_tokens": 686586779.0, + "reward": 1.0322265625, + "reward_std": 0.27012765407562256, + "rewards/accuracy_reward/mean": 0.1088709682226181, + "rewards/accuracy_reward/std": 0.31179171800613403, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.19028981029987335, "step": 1044 }, { @@ -30291,27 +30291,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 823.98828125, - "completions/mean_terminated_length": 750.4968872070312, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/max_terminated_length": 1910.0, + "completions/mean_length": 847.267578125, + "completions/mean_terminated_length": 788.2151489257812, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.35674660749338566, - "grad_norm": 1.5825062990188599, - "kl": 6.8515625, - "learning_rate": 8.311687843195481e-07, - "loss": 0.4378, - "num_tokens": 617346394.0, - "reward": 1.66796875, - "reward_std": 0.6295381784439087, - "rewards/accuracy_reward/mean": 0.08064515888690948, - "rewards/accuracy_reward/std": 0.2725643217563629, - "rewards/format_reward/mean": 0.7109375, - "rewards/format_reward/std": 0.45377036929130554, - "rewards/tag_count_reward/mean": 0.87890625, - "rewards/tag_count_reward/std": 0.2308928668498993, + "grad_norm": 2.5916624069213867, + "kl": 5.203125, + "learning_rate": 8.314680057977636e-07, + "loss": 0.3781, + "num_tokens": 687104324.0, + "reward": 1.0283203125, + "reward_std": 0.26283249258995056, + "rewards/accuracy_reward/mean": 0.12298387289047241, + "rewards/accuracy_reward/std": 0.32875028252601624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9091796875, + "rewards/tag_count_reward/std": 0.20465165376663208, "step": 1045 }, { @@ -30322,25 +30322,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 857.66015625, - "completions/mean_terminated_length": 799.1188354492188, - "completions/min_length": 58.0, - "completions/min_terminated_length": 58.0, + "completions/max_terminated_length": 1971.0, + "completions/mean_length": 870.857421875, + "completions/mean_terminated_length": 812.9651489257812, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.3570879918067765, - "grad_norm": 2.7481093406677246, - "kl": 4.765625, - "learning_rate": 8.307500082927726e-07, - "loss": 0.2886, - "num_tokens": 617863468.0, - "reward": 1.62939453125, - "reward_std": 0.6399465799331665, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.708984375, - "rewards/format_reward/std": 0.45467492938041687, - "rewards/tag_count_reward/mean": 0.88916015625, - "rewards/tag_count_reward/std": 0.22238846123218536, + "grad_norm": 3.0527377128601074, + "kl": 4.7890625, + "learning_rate": 8.3104935624773e-07, + "loss": 0.3495, + "num_tokens": 687628155.0, + "reward": 0.97705078125, + "reward_std": 0.24015676975250244, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92041015625, + "rewards/tag_count_reward/std": 0.1931772381067276, "step": 1046 }, { @@ -30349,27 +30349,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1773.0, - "completions/mean_length": 808.796875, - "completions/mean_terminated_length": 763.6437377929688, - "completions/min_length": 106.0, - "completions/min_terminated_length": 106.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 851.97265625, + "completions/mean_terminated_length": 774.8898315429688, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, "epoch": 0.3574293761201673, - "grad_norm": 2.5364630222320557, - "kl": 4.58984375, - "learning_rate": 8.303308337927103e-07, - "loss": 0.2764, - "num_tokens": 618359188.0, - "reward": 1.73046875, - "reward_std": 0.656836211681366, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.74609375, - "rewards/format_reward/std": 0.43567025661468506, - "rewards/tag_count_reward/mean": 0.90234375, - "rewards/tag_count_reward/std": 0.20686452090740204, + "grad_norm": 4.23346471786499, + "kl": 6.6171875, + "learning_rate": 8.306303074968283e-07, + "loss": 0.4729, + "num_tokens": 688145981.0, + "reward": 1.009765625, + "reward_std": 0.2975313663482666, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.19913546741008759, "step": 1047 }, { @@ -30378,27 +30378,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 781.765625, - "completions/mean_terminated_length": 727.6090087890625, - "completions/min_length": 40.0, - "completions/min_terminated_length": 40.0, + "completions/max_terminated_length": 1883.0, + "completions/mean_length": 828.525390625, + "completions/mean_terminated_length": 747.2271118164062, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, "epoch": 0.3577707604335581, - "grad_norm": 4.034943580627441, - "kl": 5.46875, - "learning_rate": 8.299112614143028e-07, - "loss": 0.3919, - "num_tokens": 618843836.0, - "reward": 1.7705078125, - "reward_std": 0.6315006017684937, + "grad_norm": 6.394073009490967, + "kl": 8.0078125, + "learning_rate": 8.302108601402731e-07, + "loss": 0.5968, + "num_tokens": 688654570.0, + "reward": 1.021484375, + "reward_std": 0.29417431354522705, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, - "rewards/format_reward/mean": 0.75, - "rewards/format_reward/std": 0.43343618512153625, - "rewards/tag_count_reward/mean": 0.9052734375, - "rewards/tag_count_reward/std": 0.1986045390367508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.21215508878231049, "step": 1048 }, { @@ -30407,27 +30407,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 826.744140625, - "completions/mean_terminated_length": 771.9122314453125, - "completions/min_length": 38.0, - "completions/min_terminated_length": 38.0, + "completions/max_terminated_length": 1788.0, + "completions/mean_length": 855.265625, + "completions/mean_terminated_length": 765.058837890625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.35811214474694886, - "grad_norm": 3.5188894271850586, - "kl": 6.734375, - "learning_rate": 8.29491291753056e-07, - "loss": 0.3553, - "num_tokens": 619348121.0, - "reward": 1.6513671875, - "reward_std": 0.7160419225692749, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.669921875, - "rewards/format_reward/std": 0.47070086002349854, - "rewards/tag_count_reward/mean": 0.8798828125, - "rewards/tag_count_reward/std": 0.22497136890888214, + "grad_norm": 9.915416717529297, + "kl": 7.2265625, + "learning_rate": 8.297910147738446e-07, + "loss": 0.4981, + "num_tokens": 689173458.0, + "reward": 1.06640625, + "reward_std": 0.33647069334983826, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.2031732052564621, "step": 1049 }, { @@ -30436,27 +30436,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1984.0, - "completions/mean_length": 863.94921875, - "completions/mean_terminated_length": 815.8170166015625, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 867.998046875, + "completions/mean_terminated_length": 770.7039794921875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, "epoch": 0.35845352906033967, - "grad_norm": 3.561249256134033, - "kl": 6.609375, - "learning_rate": 8.290709254050403e-07, - "loss": 0.3654, - "num_tokens": 619872239.0, - "reward": 1.58251953125, - "reward_std": 0.6410717368125916, - "rewards/accuracy_reward/mean": 0.030241934582591057, - "rewards/accuracy_reward/std": 0.1714252382516861, - "rewards/format_reward/mean": 0.677734375, - "rewards/format_reward/std": 0.46780112385749817, - "rewards/tag_count_reward/mean": 0.87548828125, - "rewards/tag_count_reward/std": 0.22447973489761353, + "grad_norm": 8.002907752990723, + "kl": 7.453125, + "learning_rate": 8.293707719938891e-07, + "loss": 0.5166, + "num_tokens": 689699649.0, + "reward": 0.90625, + "reward_std": 0.22413699328899384, + "rewards/accuracy_reward/mean": 0.026209676638245583, + "rewards/accuracy_reward/std": 0.1599196493625641, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.880859375, + "rewards/tag_count_reward/std": 0.24121521413326263, "step": 1050 }, { @@ -30465,27 +30465,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 785.029296875, - "completions/mean_terminated_length": 739.0101318359375, - "completions/min_length": 32.0, - "completions/min_terminated_length": 32.0, + "completions/max_terminated_length": 1818.0, + "completions/mean_length": 821.2109375, + "completions/mean_terminated_length": 752.9154663085938, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.3587949133737305, - "grad_norm": 2.92173171043396, - "kl": 6.84375, - "learning_rate": 8.286501629668887e-07, - "loss": 0.3545, - "num_tokens": 620348702.0, - "reward": 1.66162109375, - "reward_std": 0.6889419555664062, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.689453125, - "rewards/format_reward/std": 0.46317005157470703, - "rewards/tag_count_reward/mean": 0.87646484375, - "rewards/tag_count_reward/std": 0.22556257247924805, + "grad_norm": 2.6896049976348877, + "kl": 4.71484375, + "learning_rate": 8.289501323973167e-07, + "loss": 0.3592, + "num_tokens": 690194637.0, + "reward": 1.06689453125, + "reward_std": 0.2854136824607849, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92822265625, + "rewards/tag_count_reward/std": 0.1905275583267212, "step": 1051 }, { @@ -30494,27 +30494,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 833.7734375, - "completions/mean_terminated_length": 789.5303955078125, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 872.4296875, + "completions/mean_terminated_length": 801.8468017578125, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.3591362976871213, - "grad_norm": 1.6500178575515747, - "kl": 6.25, - "learning_rate": 8.282290050357966e-07, - "loss": 0.3758, - "num_tokens": 620848810.0, - "reward": 1.61376953125, - "reward_std": 0.6589176058769226, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.677734375, - "rewards/format_reward/std": 0.46780112385749817, - "rewards/tag_count_reward/mean": 0.88720703125, - "rewards/tag_count_reward/std": 0.21410034596920013, + "grad_norm": 7.166719436645508, + "kl": 5.109375, + "learning_rate": 8.285290965816016e-07, + "loss": 0.4263, + "num_tokens": 690714537.0, + "reward": 0.9775390625, + "reward_std": 0.23442703485488892, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.20515529811382294, "step": 1052 }, { @@ -30523,27 +30523,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 862.044921875, - "completions/mean_terminated_length": 816.3387451171875, - "completions/min_length": 9.0, - "completions/min_terminated_length": 9.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 857.802734375, + "completions/mean_terminated_length": 814.4352416992188, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, "epoch": 0.35947768200051206, - "grad_norm": 3.1046407222747803, - "kl": 5.515625, - "learning_rate": 8.278074522095207e-07, - "loss": 0.3305, - "num_tokens": 621370817.0, - "reward": 1.630859375, - "reward_std": 0.6795388460159302, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.69140625, - "rewards/format_reward/std": 0.4623647928237915, - "rewards/tag_count_reward/mean": 0.8828125, - "rewards/tag_count_reward/std": 0.22160130739212036, + "grad_norm": 4.915225982666016, + "kl": 3.65625, + "learning_rate": 8.281076651447806e-07, + "loss": 0.3051, + "num_tokens": 691234372.0, + "reward": 1.03271484375, + "reward_std": 0.24316367506980896, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.17433759570121765, "step": 1053 }, { @@ -30552,27 +30552,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 877.873046875, - "completions/mean_terminated_length": 827.826904296875, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 1870.0, + "completions/mean_length": 865.78515625, + "completions/mean_terminated_length": 807.6433715820312, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, "epoch": 0.35981906631390287, - "grad_norm": 2.707383155822754, - "kl": 4.17578125, - "learning_rate": 8.273855050863779e-07, - "loss": 0.2529, - "num_tokens": 621906192.0, - "reward": 1.70751953125, - "reward_std": 0.6367489099502563, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.73828125, - "rewards/format_reward/std": 0.44000017642974854, - "rewards/tag_count_reward/mean": 0.90087890625, - "rewards/tag_count_reward/std": 0.20467674732208252, + "grad_norm": 4.675076961517334, + "kl": 3.66796875, + "learning_rate": 8.276858386854524e-07, + "loss": 0.3293, + "num_tokens": 691763558.0, + "reward": 1.0087890625, + "reward_std": 0.2334781140089035, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9326171875, + "rewards/tag_count_reward/std": 0.1859828382730484, "step": 1054 }, { @@ -30581,27 +30581,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 807.484375, - "completions/mean_terminated_length": 764.880859375, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 853.79296875, + "completions/mean_terminated_length": 795.0614013671875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, "epoch": 0.3601604506272937, - "grad_norm": 1.5204405784606934, - "kl": 5.0625, - "learning_rate": 8.269631642652454e-07, - "loss": 0.2856, - "num_tokens": 622394088.0, - "reward": 1.72119140625, - "reward_std": 0.6696908473968506, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.728515625, - "rewards/format_reward/std": 0.44516023993492126, - "rewards/tag_count_reward/mean": 0.89892578125, - "rewards/tag_count_reward/std": 0.21195626258850098, + "grad_norm": 5.457221508026123, + "kl": 3.8671875, + "learning_rate": 8.272636178027768e-07, + "loss": 0.3242, + "num_tokens": 692275164.0, + "reward": 1.04296875, + "reward_std": 0.2660578191280365, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.919921875, + "rewards/tag_count_reward/std": 0.19890500605106354, "step": 1055 }, { @@ -30610,27 +30610,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 826.232421875, - "completions/mean_terminated_length": 779.14599609375, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/max_terminated_length": 1808.0, + "completions/mean_length": 901.771484375, + "completions/mean_terminated_length": 804.6334838867188, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, "epoch": 0.3605018349406845, - "grad_norm": 0.9496316909790039, - "kl": 5.234375, - "learning_rate": 8.265404303455583e-07, - "loss": 0.2785, - "num_tokens": 622892383.0, - "reward": 1.69775390625, - "reward_std": 0.6280603408813477, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.744140625, - "rewards/format_reward/std": 0.43676990270614624, - "rewards/tag_count_reward/mean": 0.90087890625, - "rewards/tag_count_reward/std": 0.20287610590457916, + "grad_norm": 2.8035614490509033, + "kl": 5.24609375, + "learning_rate": 8.268410030964739e-07, + "loss": 0.4161, + "num_tokens": 692812135.0, + "reward": 0.970703125, + "reward_std": 0.25810378789901733, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.21154166758060455, "step": 1056 }, { @@ -30639,27 +30639,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 814.70703125, - "completions/mean_terminated_length": 772.3515625, - "completions/min_length": 46.0, - "completions/min_terminated_length": 46.0, + "completions/max_terminated_length": 1939.0, + "completions/mean_length": 863.29296875, + "completions/mean_terminated_length": 760.1656494140625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.36084321925407525, - "grad_norm": 4.240236282348633, - "kl": 6.4453125, - "learning_rate": 8.261173039273103e-07, - "loss": 0.3098, - "num_tokens": 623380057.0, - "reward": 1.67333984375, - "reward_std": 0.6242455244064331, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.720703125, - "rewards/format_reward/std": 0.44909247756004333, - "rewards/tag_count_reward/mean": 0.89794921875, - "rewards/tag_count_reward/std": 0.20382998883724213, + "grad_norm": 2.4918346405029297, + "kl": 6.2421875, + "learning_rate": 8.264179951668234e-07, + "loss": 0.4906, + "num_tokens": 693324685.0, + "reward": 0.9990234375, + "reward_std": 0.26943477988243103, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.2089213728904724, "step": 1057 }, { @@ -30668,27 +30668,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 845.03515625, - "completions/mean_terminated_length": 803.7212524414062, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 910.142578125, + "completions/mean_terminated_length": 813.7139892578125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.36118460356746607, - "grad_norm": 2.6613004207611084, - "kl": 6.1015625, - "learning_rate": 8.25693785611052e-07, - "loss": 0.3315, - "num_tokens": 623892971.0, - "reward": 1.63720703125, - "reward_std": 0.6636764407157898, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.69921875, - "rewards/format_reward/std": 0.45904624462127686, - "rewards/tag_count_reward/mean": 0.87548828125, - "rewards/tag_count_reward/std": 0.21952125430107117, + "grad_norm": 3.9498162269592285, + "kl": 6.4765625, + "learning_rate": 8.259945946146631e-07, + "loss": 0.507, + "num_tokens": 693870934.0, + "reward": 0.98974609375, + "reward_std": 0.28296419978141785, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90576171875, + "rewards/tag_count_reward/std": 0.2139485478401184, "step": 1058 }, { @@ -30697,27 +30697,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.115234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 856.583984375, - "completions/mean_terminated_length": 820.625732421875, - "completions/min_length": 81.0, - "completions/min_terminated_length": 81.0, + "completions/max_terminated_length": 1768.0, + "completions/mean_length": 907.759765625, + "completions/mean_terminated_length": 759.2516479492188, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, "epoch": 0.3615259878808569, - "grad_norm": 2.4534904956817627, - "kl": 6.5234375, - "learning_rate": 8.2526987599789e-07, - "loss": 0.3675, - "num_tokens": 624397830.0, - "reward": 1.689453125, - "reward_std": 0.6934724450111389, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.716796875, - "rewards/format_reward/std": 0.4509948492050171, - "rewards/tag_count_reward/mean": 0.888671875, - "rewards/tag_count_reward/std": 0.21909481287002563, + "grad_norm": 6.595268249511719, + "kl": 9.4609375, + "learning_rate": 8.255708020413886e-07, + "loss": 0.6893, + "num_tokens": 694401995.0, + "reward": 1.02099609375, + "reward_std": 0.31800708174705505, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89013671875, + "rewards/tag_count_reward/std": 0.2256007045507431, "step": 1059 }, { @@ -30726,27 +30726,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.10546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1960.0, - "completions/mean_length": 784.845703125, - "completions/mean_terminated_length": 738.8198852539062, - "completions/min_length": 76.0, - "completions/min_terminated_length": 76.0, + "completions/max_terminated_length": 1722.0, + "completions/mean_length": 884.287109375, + "completions/mean_terminated_length": 747.080810546875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.3618673721942477, - "grad_norm": 1.810727596282959, - "kl": 4.63671875, - "learning_rate": 8.248455756894865e-07, - "loss": 0.2552, - "num_tokens": 624874919.0, - "reward": 1.7666015625, - "reward_std": 0.6343519687652588, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.767578125, - "rewards/format_reward/std": 0.42278963327407837, - "rewards/tag_count_reward/mean": 0.9072265625, - "rewards/tag_count_reward/std": 0.20437131822109222, + "grad_norm": 7.928231239318848, + "kl": 8.59375, + "learning_rate": 8.251466180489526e-07, + "loss": 0.6105, + "num_tokens": 694929998.0, + "reward": 0.98486328125, + "reward_std": 0.2937811613082886, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89111328125, + "rewards/tag_count_reward/std": 0.22225522994995117, "step": 1060 }, { @@ -30755,27 +30755,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.115234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 855.83203125, - "completions/mean_terminated_length": 812.3927612304688, - "completions/min_length": 198.0, - "completions/min_terminated_length": 198.0, + "completions/max_terminated_length": 1901.0, + "completions/mean_length": 916.556640625, + "completions/mean_terminated_length": 769.1942749023438, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, "epoch": 0.36220875650763845, - "grad_norm": 1.5865952968597412, - "kl": 4.54296875, - "learning_rate": 8.244208852880583e-07, - "loss": 0.2585, - "num_tokens": 625403969.0, - "reward": 1.7119140625, - "reward_std": 0.6156963109970093, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.720703125, - "rewards/format_reward/std": 0.44909247756004333, - "rewards/tag_count_reward/mean": 0.8935546875, - "rewards/tag_count_reward/std": 0.203678160905838, + "grad_norm": 8.651107788085938, + "kl": 8.921875, + "learning_rate": 8.247220432398635e-07, + "loss": 0.6441, + "num_tokens": 695490139.0, + "reward": 0.9794921875, + "reward_std": 0.3411746621131897, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8759765625, + "rewards/tag_count_reward/std": 0.24630144238471985, "step": 1061 }, { @@ -30784,27 +30784,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.119140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 742.169921875, - "completions/mean_terminated_length": 716.1574096679688, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 868.87109375, + "completions/mean_terminated_length": 709.3880004882812, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.36255014082102927, - "grad_norm": 1.674841284751892, - "kl": 4.56640625, - "learning_rate": 8.239958053963758e-07, - "loss": 0.2619, - "num_tokens": 625857752.0, - "reward": 1.765625, - "reward_std": 0.5894856452941895, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.7578125, - "rewards/format_reward/std": 0.42882615327835083, - "rewards/tag_count_reward/mean": 0.912109375, - "rewards/tag_count_reward/std": 0.18196536600589752, + "grad_norm": 4.257208824157715, + "kl": 8.625, + "learning_rate": 8.242970782171847e-07, + "loss": 0.6578, + "num_tokens": 696008793.0, + "reward": 0.99365234375, + "reward_std": 0.3104914128780365, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88623046875, + "rewards/tag_count_reward/std": 0.23485609889030457, "step": 1062 }, { @@ -30813,27 +30813,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 682.232421875, - "completions/mean_terminated_length": 666.03759765625, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 1557.0, + "completions/mean_length": 762.447265625, + "completions/mean_terminated_length": 653.5021362304688, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, "epoch": 0.3628915251344201, - "grad_norm": 2.8770751953125, - "kl": 4.38671875, - "learning_rate": 8.23570336617762e-07, - "loss": 0.2604, - "num_tokens": 626275391.0, - "reward": 1.80908203125, - "reward_std": 0.6140027642250061, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, - "rewards/format_reward/mean": 0.7734375, - "rewards/format_reward/std": 0.4190165400505066, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.1775096207857132, + "grad_norm": 5.456188678741455, + "kl": 6.375, + "learning_rate": 8.238717235845342e-07, + "loss": 0.4363, + "num_tokens": 696467502.0, + "reward": 1.044921875, + "reward_std": 0.29651883244514465, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.20177629590034485, "step": 1063 }, { @@ -30842,27 +30842,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 821.57421875, - "completions/mean_terminated_length": 779.45458984375, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 1694.0, + "completions/mean_length": 852.046875, + "completions/mean_terminated_length": 747.9406127929688, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, "epoch": 0.3632329094478109, - "grad_norm": 2.4590604305267334, - "kl": 6.3984375, - "learning_rate": 8.23144479556092e-07, - "loss": 0.3595, - "num_tokens": 626775701.0, - "reward": 1.66259765625, - "reward_std": 0.694244921207428, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.6953125, - "rewards/format_reward/std": 0.4607250988483429, - "rewards/tag_count_reward/mean": 0.87939453125, - "rewards/tag_count_reward/std": 0.21947772800922394, + "grad_norm": 3.4516074657440186, + "kl": 5.38671875, + "learning_rate": 8.234459799460834e-07, + "loss": 0.4415, + "num_tokens": 696983414.0, + "reward": 0.9951171875, + "reward_std": 0.28641411662101746, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8896484375, + "rewards/tag_count_reward/std": 0.22454623878002167, "step": 1064 }, { @@ -30871,27 +30871,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1975.0, - "completions/mean_length": 817.7109375, - "completions/mean_terminated_length": 778.024169921875, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 1603.0, + "completions/mean_length": 885.55859375, + "completions/mean_terminated_length": 765.3060302734375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.36357429376120165, - "grad_norm": 1.5840237140655518, - "kl": 5.2578125, - "learning_rate": 8.227182348157923e-07, - "loss": 0.2916, - "num_tokens": 627268497.0, - "reward": 1.716796875, - "reward_std": 0.6501511335372925, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.748046875, - "rewards/format_reward/std": 0.43455907702445984, - "rewards/tag_count_reward/mean": 0.89453125, - "rewards/tag_count_reward/std": 0.20774950087070465, + "grad_norm": 5.238219738006592, + "kl": 5.59765625, + "learning_rate": 8.230198479065557e-07, + "loss": 0.442, + "num_tokens": 697510948.0, + "reward": 0.95947265625, + "reward_std": 0.31128498911857605, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88134765625, + "rewards/tag_count_reward/std": 0.24472665786743164, "step": 1065 }, { @@ -30900,27 +30900,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 790.078125, - "completions/mean_terminated_length": 765.0199584960938, - "completions/min_length": 196.0, - "completions/min_terminated_length": 196.0, + "completions/max_terminated_length": 1956.0, + "completions/mean_length": 832.685546875, + "completions/mean_terminated_length": 732.4799194335938, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.36391567807459246, - "grad_norm": 2.5225985050201416, - "kl": 5.8828125, - "learning_rate": 8.222916030018389e-07, - "loss": 0.3334, - "num_tokens": 627744697.0, - "reward": 1.7060546875, - "reward_std": 0.6291856169700623, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.724609375, - "rewards/format_reward/std": 0.44714778661727905, - "rewards/tag_count_reward/mean": 0.8876953125, - "rewards/tag_count_reward/std": 0.2082800716161728, + "grad_norm": 7.5198163986206055, + "kl": 5.1640625, + "learning_rate": 8.22593328071227e-07, + "loss": 0.4354, + "num_tokens": 698008963.0, + "reward": 1.0634765625, + "reward_std": 0.3327295780181885, + "rewards/accuracy_reward/mean": 0.162109375, + "rewards/accuracy_reward/std": 0.3689115643501282, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9013671875, + "rewards/tag_count_reward/std": 0.22834400832653046, "step": 1066 }, { @@ -30929,27 +30929,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1969.0, - "completions/mean_length": 772.548828125, - "completions/mean_terminated_length": 739.3206787109375, - "completions/min_length": 36.0, - "completions/min_terminated_length": 36.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 843.572265625, + "completions/mean_terminated_length": 752.4811401367188, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.3642570623879833, - "grad_norm": 1.46543288230896, - "kl": 4.73828125, - "learning_rate": 8.21864584719758e-07, - "loss": 0.254, - "num_tokens": 628220450.0, - "reward": 1.75927734375, - "reward_std": 0.6168092489242554, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.779296875, - "rewards/format_reward/std": 0.4151262938976288, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.19466042518615723, + "grad_norm": 9.311989784240723, + "kl": 6.1640625, + "learning_rate": 8.221664210459234e-07, + "loss": 0.5214, + "num_tokens": 698521080.0, + "reward": 0.974609375, + "reward_std": 0.2974643111228943, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.888671875, + "rewards/tag_count_reward/std": 0.2331579178571701, "step": 1067 }, { @@ -30958,27 +30958,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1986.0, - "completions/mean_length": 822.328125, - "completions/mean_terminated_length": 785.3359985351562, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 1879.0, + "completions/mean_length": 875.27734375, + "completions/mean_terminated_length": 775.89404296875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, "epoch": 0.3645984467013741, - "grad_norm": 1.9685052633285522, - "kl": 4.9921875, - "learning_rate": 8.214371805756238e-07, - "loss": 0.2502, - "num_tokens": 628718378.0, - "reward": 1.7216796875, - "reward_std": 0.6598231196403503, - "rewards/accuracy_reward/mean": 0.10282257944345474, - "rewards/accuracy_reward/std": 0.30403366684913635, - "rewards/format_reward/mean": 0.732421875, - "rewards/format_reward/std": 0.4431293308734894, - "rewards/tag_count_reward/mean": 0.8896484375, - "rewards/tag_count_reward/std": 0.21451778709888458, + "grad_norm": 3.06099796295166, + "kl": 6.734375, + "learning_rate": 8.217391274370209e-07, + "loss": 0.5081, + "num_tokens": 699046118.0, + "reward": 0.99462890625, + "reward_std": 0.31980249285697937, + "rewards/accuracy_reward/mean": 0.11290322244167328, + "rewards/accuracy_reward/std": 0.3167939782142639, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88525390625, + "rewards/tag_count_reward/std": 0.2343795895576477, "step": 1068 }, { @@ -30987,27 +30987,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 888.525390625, - "completions/mean_terminated_length": 836.46728515625, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 1908.0, + "completions/mean_length": 889.423828125, + "completions/mean_terminated_length": 788.5711669921875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, "epoch": 0.36493983101476485, - "grad_norm": 2.010416269302368, - "kl": 4.625, - "learning_rate": 8.210093911760582e-07, - "loss": 0.2752, - "num_tokens": 629254487.0, - "reward": 1.66357421875, - "reward_std": 0.6304594874382019, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.720703125, - "rewards/format_reward/std": 0.44909247756004333, - "rewards/tag_count_reward/mean": 0.89404296875, - "rewards/tag_count_reward/std": 0.20121611654758453, + "grad_norm": 3.763425350189209, + "kl": 7.203125, + "learning_rate": 8.213114478514453e-07, + "loss": 0.5424, + "num_tokens": 699582687.0, + "reward": 0.91064453125, + "reward_std": 0.2960495054721832, + "rewards/accuracy_reward/mean": 0.041015625, + "rewards/accuracy_reward/std": 0.19852031767368317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86962890625, + "rewards/tag_count_reward/std": 0.251162588596344, "step": 1069 }, { @@ -31016,27 +31016,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 770.421875, - "completions/mean_terminated_length": 744.9721069335938, - "completions/min_length": 56.0, - "completions/min_terminated_length": 56.0, + "completions/max_terminated_length": 1761.0, + "completions/mean_length": 812.064453125, + "completions/mean_terminated_length": 712.98095703125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.36528121532815566, - "grad_norm": 1.9328861236572266, - "kl": 4.06640625, - "learning_rate": 8.205812171282302e-07, - "loss": 0.2321, - "num_tokens": 629727439.0, - "reward": 1.7705078125, - "reward_std": 0.5791828036308289, - "rewards/accuracy_reward/mean": 0.07459677755832672, - "rewards/accuracy_reward/std": 0.263004869222641, - "rewards/format_reward/mean": 0.78125, - "rewards/format_reward/std": 0.41380295157432556, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.18883807957172394, + "grad_norm": 3.755972146987915, + "kl": 8.65625, + "learning_rate": 8.208833828966698e-07, + "loss": 0.6401, + "num_tokens": 700076960.0, + "reward": 0.9345703125, + "reward_std": 0.3185652792453766, + "rewards/accuracy_reward/mean": 0.06653226166963577, + "rewards/accuracy_reward/std": 0.2494617998600006, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8701171875, + "rewards/tag_count_reward/std": 0.25455841422080994, "step": 1070 }, { @@ -31045,27 +31045,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.1328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 850.46875, - "completions/mean_terminated_length": 809.3414306640625, - "completions/min_length": 93.0, - "completions/min_terminated_length": 93.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 931.380859375, + "completions/mean_terminated_length": 760.3671264648438, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, "epoch": 0.3656225996415465, - "grad_norm": 1.2636034488677979, - "kl": 4.890625, - "learning_rate": 8.201526590398543e-07, - "loss": 0.2682, - "num_tokens": 630247391.0, - "reward": 1.70361328125, - "reward_std": 0.6831998825073242, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.724609375, - "rewards/format_reward/std": 0.44714778661727905, - "rewards/tag_count_reward/mean": 0.88525390625, - "rewards/tag_count_reward/std": 0.22039444744586945, + "grad_norm": 13.6011323928833, + "kl": 11.875, + "learning_rate": 8.204549331807157e-07, + "loss": 0.8274, + "num_tokens": 700638339.0, + "reward": 0.93310546875, + "reward_std": 0.3562435209751129, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.83544921875, + "rewards/tag_count_reward/std": 0.2760820984840393, "step": 1071 }, { @@ -31074,27 +31074,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.115234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 868.84765625, - "completions/mean_terminated_length": 820.9146118164062, - "completions/min_length": 28.0, - "completions/min_terminated_length": 28.0, + "completions/max_terminated_length": 1788.0, + "completions/mean_length": 925.4140625, + "completions/mean_terminated_length": 779.205322265625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.3659639839549373, - "grad_norm": 1.3227003812789917, - "kl": 5.53125, - "learning_rate": 8.197237175191907e-07, - "loss": 0.2956, - "num_tokens": 630769969.0, - "reward": 1.7021484375, - "reward_std": 0.633798360824585, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.732421875, - "rewards/format_reward/std": 0.4431293308734894, - "rewards/tag_count_reward/mean": 0.8974609375, - "rewards/tag_count_reward/std": 0.2121168076992035, + "grad_norm": 7.684846878051758, + "kl": 11.171875, + "learning_rate": 8.200260993121503e-07, + "loss": 0.7788, + "num_tokens": 701189879.0, + "reward": 0.90087890625, + "reward_std": 0.3302449882030487, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.84033203125, + "rewards/tag_count_reward/std": 0.2645368278026581, "step": 1072 }, { @@ -31103,27 +31103,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.091796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 778.013671875, - "completions/mean_terminated_length": 768.0137939453125, - "completions/min_length": 32.0, - "completions/min_terminated_length": 32.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 823.134765625, + "completions/mean_terminated_length": 699.3311767578125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.36630536826832805, - "grad_norm": 1.466387152671814, - "kl": 4.18359375, - "learning_rate": 8.192943931750431e-07, - "loss": 0.2304, - "num_tokens": 631249336.0, - "reward": 1.73974609375, - "reward_std": 0.5979659557342529, - "rewards/accuracy_reward/mean": 0.08669354766607285, - "rewards/accuracy_reward/std": 0.281669557094574, - "rewards/format_reward/mean": 0.74609375, - "rewards/format_reward/std": 0.43567025661468506, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.19413939118385315, + "grad_norm": 4.033299922943115, + "kl": 7.6953125, + "learning_rate": 8.195968819000867e-07, + "loss": 0.5581, + "num_tokens": 701692348.0, + "reward": 0.99267578125, + "reward_std": 0.32155275344848633, + "rewards/accuracy_reward/mean": 0.1088709682226181, + "rewards/accuracy_reward/std": 0.31179171800613403, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88720703125, + "rewards/tag_count_reward/std": 0.23688167333602905, "step": 1073 }, { @@ -31132,27 +31132,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.12890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1942.0, - "completions/mean_length": 793.66015625, - "completions/mean_terminated_length": 768.67333984375, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, - "epoch": 0.36664675258171886, - "grad_norm": 1.0847512483596802, - "kl": 5.2890625, - "learning_rate": 8.188646866167591e-07, - "loss": 0.2889, - "num_tokens": 631726858.0, - "reward": 1.75, - "reward_std": 0.6973233819007874, - "rewards/accuracy_reward/mean": 0.11290322244167328, - "rewards/accuracy_reward/std": 0.3167939782142639, - "rewards/format_reward/mean": 0.748046875, - "rewards/format_reward/std": 0.43455907702445984, - "rewards/tag_count_reward/mean": 0.892578125, - "rewards/tag_count_reward/std": 0.21486715972423553, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 912.025390625, + "completions/mean_terminated_length": 743.9215698242188, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.36664675258171886, + "grad_norm": 6.180331230163574, + "kl": 10.46875, + "learning_rate": 8.191672815541827e-07, + "loss": 0.7542, + "num_tokens": 702230473.0, + "reward": 0.96044921875, + "reward_std": 0.3685113489627838, + "rewards/accuracy_reward/mean": 0.13306452333927155, + "rewards/accuracy_reward/std": 0.3399873375892639, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.83154296875, + "rewards/tag_count_reward/std": 0.27946019172668457, "step": 1074 }, { @@ -31161,27 +31161,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.115234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1970.0, - "completions/mean_length": 803.58203125, - "completions/mean_terminated_length": 778.7928466796875, - "completions/min_length": 41.0, - "completions/min_terminated_length": 41.0, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 860.146484375, + "completions/mean_terminated_length": 705.4370727539062, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.3669881368951097, - "grad_norm": 1.3949421644210815, - "kl": 4.5703125, - "learning_rate": 8.184345984542283e-07, - "loss": 0.2707, - "num_tokens": 632215252.0, - "reward": 1.76611328125, - "reward_std": 0.6225503087043762, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.759765625, - "rewards/format_reward/std": 0.4276435375213623, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.19206106662750244, + "grad_norm": 4.411325454711914, + "kl": 8.0859375, + "learning_rate": 8.187372988846406e-07, + "loss": 0.637, + "num_tokens": 702747828.0, + "reward": 0.93505859375, + "reward_std": 0.34748780727386475, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.84912109375, + "rewards/tag_count_reward/std": 0.2705618739128113, "step": 1075 }, { @@ -31190,27 +31190,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.111328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1977.0, - "completions/mean_length": 756.611328125, - "completions/mean_terminated_length": 733.5049438476562, - "completions/min_length": 9.0, - "completions/min_terminated_length": 9.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 859.63671875, + "completions/mean_terminated_length": 710.764892578125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.3673295212085005, - "grad_norm": 1.2364869117736816, - "kl": 5.0, - "learning_rate": 8.180041292978826e-07, - "loss": 0.2846, - "num_tokens": 632681005.0, - "reward": 1.79443359375, - "reward_std": 0.6380493640899658, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310423493385315, - "rewards/format_reward/mean": 0.76171875, - "rewards/format_reward/std": 0.42644867300987244, - "rewards/tag_count_reward/mean": 0.90771484375, - "rewards/tag_count_reward/std": 0.19944652915000916, + "grad_norm": 5.4574360847473145, + "kl": 7.21875, + "learning_rate": 8.183069345022047e-07, + "loss": 0.5797, + "num_tokens": 703266330.0, + "reward": 0.96533203125, + "reward_std": 0.36942243576049805, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.84619140625, + "rewards/tag_count_reward/std": 0.264778733253479, "step": 1076 }, { @@ -31219,27 +31219,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.154296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, - "completions/mean_length": 790.45703125, - "completions/mean_terminated_length": 767.9562377929688, - "completions/min_length": 8.0, - "completions/min_terminated_length": 8.0, + "completions/mean_length": 911.302734375, + "completions/mean_terminated_length": 703.91455078125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, "epoch": 0.36767090552189124, - "grad_norm": 1.0173770189285278, - "kl": 5.1875, - "learning_rate": 8.175732797586939e-07, - "loss": 0.2963, - "num_tokens": 633153223.0, - "reward": 1.712890625, - "reward_std": 0.6708530783653259, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.74609375, - "rewards/format_reward/std": 0.43567025661468506, - "rewards/tag_count_reward/mean": 0.892578125, - "rewards/tag_count_reward/std": 0.21656812727451324, + "grad_norm": 11.526627540588379, + "kl": 7.6328125, + "learning_rate": 8.178761890181624e-07, + "loss": 0.6923, + "num_tokens": 703800421.0, + "reward": 0.92431640625, + "reward_std": 0.35974210500717163, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.82080078125, + "rewards/tag_count_reward/std": 0.28922587633132935, "step": 1077 }, { @@ -31248,27 +31248,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 821.943359375, - "completions/mean_terminated_length": 795.02392578125, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 911.83203125, + "completions/mean_terminated_length": 755.2933349609375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, "epoch": 0.36801228983528206, - "grad_norm": 1.6971614360809326, - "kl": 5.8671875, - "learning_rate": 8.171420504481743e-07, - "loss": 0.3324, - "num_tokens": 633641386.0, - "reward": 1.6669921875, - "reward_std": 0.6330738067626953, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.7265625, - "rewards/format_reward/std": 0.4461594223976135, - "rewards/tag_count_reward/mean": 0.8994140625, - "rewards/tag_count_reward/std": 0.19879689812660217, + "grad_norm": 16.47621726989746, + "kl": 6.8984375, + "learning_rate": 8.174450630443423e-07, + "loss": 0.6225, + "num_tokens": 704334607.0, + "reward": 0.8896484375, + "reward_std": 0.32641637325286865, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.2029850035905838, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8466796875, + "rewards/tag_count_reward/std": 0.27167075872421265, "step": 1078 }, { @@ -31277,27 +31277,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.154296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 785.74609375, - "completions/mean_terminated_length": 729.0734252929688, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 901.01953125, + "completions/mean_terminated_length": 691.7551879882812, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, "epoch": 0.36835367414867287, - "grad_norm": 1.5560015439987183, - "kl": 6.8671875, - "learning_rate": 8.167104419783753e-07, - "loss": 0.4281, - "num_tokens": 634127256.0, - "reward": 1.7783203125, - "reward_std": 0.678869366645813, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, - "rewards/format_reward/mean": 0.7578125, - "rewards/format_reward/std": 0.42882615327835083, - "rewards/tag_count_reward/mean": 0.8974609375, - "rewards/tag_count_reward/std": 0.22225362062454224, + "grad_norm": 6.549627304077148, + "kl": 7.34375, + "learning_rate": 8.170135571931125e-07, + "loss": 0.6046, + "num_tokens": 704879497.0, + "reward": 0.9775390625, + "reward_std": 0.35753077268600464, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8310546875, + "rewards/tag_count_reward/std": 0.2806939482688904, "step": 1079 }, { @@ -31306,27 +31306,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.181640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 809.8359375, - "completions/mean_terminated_length": 787.681884765625, - "completions/min_length": 74.0, - "completions/min_terminated_length": 74.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 984.263671875, + "completions/mean_terminated_length": 748.159912109375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, "epoch": 0.3686950584620637, - "grad_norm": 2.276832342147827, - "kl": 5.34375, - "learning_rate": 8.162784549618855e-07, - "loss": 0.2815, - "num_tokens": 634623476.0, - "reward": 1.6962890625, - "reward_std": 0.649118185043335, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.728515625, - "rewards/format_reward/std": 0.44516023993492126, - "rewards/tag_count_reward/mean": 0.8974609375, - "rewards/tag_count_reward/std": 0.2050807625055313, + "grad_norm": 5.982292652130127, + "kl": 8.5234375, + "learning_rate": 8.165816720773819e-07, + "loss": 0.6862, + "num_tokens": 705465024.0, + "reward": 0.90625, + "reward_std": 0.3280293345451355, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.833984375, + "rewards/tag_count_reward/std": 0.27542415261268616, "step": 1080 }, { @@ -31335,27 +31335,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.185546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1952.0, - "completions/mean_length": 822.41796875, - "completions/mean_terminated_length": 787.9638061523438, - "completions/min_length": 222.0, - "completions/min_terminated_length": 222.0, + "completions/max_terminated_length": 1559.0, + "completions/mean_length": 967.390625, + "completions/mean_terminated_length": 721.2086791992188, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.36903644277545444, - "grad_norm": 2.9448065757751465, - "kl": 4.58984375, - "learning_rate": 8.158460900118321e-07, - "loss": 0.3172, - "num_tokens": 635122042.0, - "reward": 1.77880859375, - "reward_std": 0.5868085622787476, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.7890625, - "rewards/format_reward/std": 0.4083731174468994, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.1838558167219162, + "grad_norm": 451.319091796875, + "kl": 20.484375, + "learning_rate": 8.161494083105976e-07, + "loss": 1.1782, + "num_tokens": 706037816.0, + "reward": 0.8984375, + "reward_std": 0.33728212118148804, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.837890625, + "rewards/tag_count_reward/std": 0.2618798613548279, "step": 1081 }, { @@ -31364,27 +31364,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.19140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 879.923828125, - "completions/mean_terminated_length": 817.4341430664062, - "completions/min_length": 185.0, - "completions/min_terminated_length": 185.0, + "completions/max_terminated_length": 1837.0, + "completions/mean_length": 978.1328125, + "completions/mean_terminated_length": 724.8792114257812, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.36937782708884526, - "grad_norm": 1.9693816900253296, - "kl": 5.25, - "learning_rate": 8.15413347741878e-07, - "loss": 0.3493, - "num_tokens": 635643347.0, - "reward": 1.67431640625, - "reward_std": 0.6367882490158081, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.736328125, - "rewards/format_reward/std": 0.4410543739795685, - "rewards/tag_count_reward/mean": 0.88916015625, - "rewards/tag_count_reward/std": 0.220177561044693, + "grad_norm": 11.029499053955078, + "kl": 11.375, + "learning_rate": 8.157167665067446e-07, + "loss": 0.8216, + "num_tokens": 706609404.0, + "reward": 0.88720703125, + "reward_std": 0.3291788697242737, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.83642578125, + "rewards/tag_count_reward/std": 0.2667597830295563, "step": 1082 }, { @@ -31393,27 +31393,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.228515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 871.9296875, - "completions/mean_terminated_length": 824.1219482421875, - "completions/min_length": 207.0, - "completions/min_terminated_length": 207.0, + "completions/max_terminated_length": 1951.0, + "completions/mean_length": 1037.57421875, + "completions/mean_terminated_length": 738.2835693359375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.36971921140223607, - "grad_norm": 1.6949392557144165, - "kl": 5.15625, - "learning_rate": 8.149802287662214e-07, - "loss": 0.3026, - "num_tokens": 636168431.0, - "reward": 1.6943359375, - "reward_std": 0.6163403987884521, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.71875, - "rewards/format_reward/std": 0.45004892349243164, - "rewards/tag_count_reward/mean": 0.8955078125, - "rewards/tag_count_reward/std": 0.2088298797607422, + "grad_norm": 14.896235466003418, + "kl": 12.5, + "learning_rate": 8.152837472803445e-07, + "loss": 0.8799, + "num_tokens": 707219298.0, + "reward": 0.88037109375, + "reward_std": 0.31797248125076294, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.81396484375, + "rewards/tag_count_reward/std": 0.28184691071510315, "step": 1083 }, { @@ -31422,27 +31422,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.150390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 777.044921875, - "completions/mean_terminated_length": 754.3041381835938, - "completions/min_length": 40.0, - "completions/min_terminated_length": 40.0, + "completions/max_terminated_length": 1526.0, + "completions/mean_length": 834.72265625, + "completions/mean_terminated_length": 619.9586181640625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, "epoch": 0.3700605957156269, - "grad_norm": 0.945247232913971, - "kl": 5.203125, - "learning_rate": 8.145467336995954e-07, - "loss": 0.2983, - "num_tokens": 636637718.0, - "reward": 1.8046875, - "reward_std": 0.6311696767807007, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, - "rewards/format_reward/mean": 0.77734375, - "rewards/format_reward/std": 0.41643625497817993, - "rewards/tag_count_reward/mean": 0.904296875, - "rewards/tag_count_reward/std": 0.20836491882801056, + "grad_norm": 6.166479110717773, + "kl": 10.171875, + "learning_rate": 8.148503512464555e-07, + "loss": 0.7846, + "num_tokens": 707718116.0, + "reward": 1.01513671875, + "reward_std": 0.36273396015167236, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88427734375, + "rewards/tag_count_reward/std": 0.23337452113628387, "step": 1084 }, { @@ -31451,27 +31451,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.162109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 837.783203125, - "completions/mean_terminated_length": 806.2545166015625, - "completions/min_length": 98.0, - "completions/min_terminated_length": 98.0, + "completions/max_terminated_length": 1648.0, + "completions/mean_length": 936.90234375, + "completions/mean_terminated_length": 721.9347534179688, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.37040198002901764, - "grad_norm": 1.0525614023208618, - "kl": 5.484375, - "learning_rate": 8.141128631572676e-07, - "loss": 0.3226, - "num_tokens": 637144727.0, - "reward": 1.67578125, - "reward_std": 0.6332330703735352, - "rewards/accuracy_reward/mean": 0.0520833320915699, - "rewards/accuracy_reward/std": 0.2224269062280655, - "rewards/format_reward/mean": 0.736328125, - "rewards/format_reward/std": 0.4410543739795685, - "rewards/tag_count_reward/mean": 0.890625, - "rewards/tag_count_reward/std": 0.22338789701461792, + "grad_norm": 4.932461738586426, + "kl": 9.6640625, + "learning_rate": 8.144165790206708e-07, + "loss": 0.7423, + "num_tokens": 708275874.0, + "reward": 0.9462890625, + "reward_std": 0.31739968061447144, + "rewards/accuracy_reward/mean": 0.0833333358168602, + "rewards/accuracy_reward/std": 0.2766737639904022, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8681640625, + "rewards/tag_count_reward/std": 0.2457110583782196, "step": 1085 }, { @@ -31480,27 +31480,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.1796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 905.400390625, - "completions/mean_terminated_length": 873.2791137695312, - "completions/min_length": 215.0, - "completions/min_terminated_length": 215.0, + "completions/max_terminated_length": 1937.0, + "completions/mean_length": 984.498046875, + "completions/mean_terminated_length": 751.54052734375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.37074336434240845, - "grad_norm": 0.8585146069526672, - "kl": 6.1640625, - "learning_rate": 8.136786177550373e-07, - "loss": 0.3761, - "num_tokens": 637686452.0, - "reward": 1.7041015625, - "reward_std": 0.6443427205085754, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.75390625, - "rewards/format_reward/std": 0.4311550557613373, - "rewards/tag_count_reward/mean": 0.8935546875, - "rewards/tag_count_reward/std": 0.22206437587738037, + "grad_norm": 3.875244379043579, + "kl": 9.125, + "learning_rate": 8.139824312191178e-07, + "loss": 0.6913, + "num_tokens": 708858097.0, + "reward": 0.91796875, + "reward_std": 0.3075498342514038, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.857421875, + "rewards/tag_count_reward/std": 0.25060316920280457, "step": 1086 }, { @@ -31509,27 +31509,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.134765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1970.0, - "completions/mean_length": 863.5859375, - "completions/mean_terminated_length": 825.3790283203125, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 1763.0, + "completions/mean_length": 929.806640625, + "completions/mean_terminated_length": 755.64111328125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.37108474865579927, - "grad_norm": 0.929337203502655, - "kl": 4.8125, - "learning_rate": 8.132439981092364e-07, - "loss": 0.2587, - "num_tokens": 638208752.0, - "reward": 1.78369140625, - "reward_std": 0.6057051420211792, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.7890625, - "rewards/format_reward/std": 0.4083731174468994, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.201870396733284, + "grad_norm": 7.4983720779418945, + "kl": 7.28125, + "learning_rate": 8.135479084584576e-07, + "loss": 0.6093, + "num_tokens": 709414302.0, + "reward": 0.986328125, + "reward_std": 0.2826131582260132, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.22104869782924652, "step": 1087 }, { @@ -31538,27 +31538,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.142578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 924.69921875, - "completions/mean_terminated_length": 879.0365600585938, - "completions/min_length": 203.0, - "completions/min_terminated_length": 203.0, + "completions/max_terminated_length": 1670.0, + "completions/mean_length": 972.50390625, + "completions/mean_terminated_length": 793.6629028320312, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.3714261329691901, - "grad_norm": 1.6980890035629272, - "kl": 4.96875, - "learning_rate": 8.128090048367283e-07, - "loss": 0.3049, - "num_tokens": 638757366.0, - "reward": 1.76806640625, - "reward_std": 0.602573573589325, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.787109375, - "rewards/format_reward/std": 0.409751296043396, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.20380185544490814, + "grad_norm": 7.247858047485352, + "kl": 7.03125, + "learning_rate": 8.131130113558837e-07, + "loss": 0.5686, + "num_tokens": 709987392.0, + "reward": 0.9501953125, + "reward_std": 0.3110334873199463, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8876953125, + "rewards/tag_count_reward/std": 0.22575154900550842, "step": 1088 }, { @@ -31567,27 +31567,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.130859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1916.0, - "completions/mean_length": 821.224609375, - "completions/mean_terminated_length": 791.7820434570312, - "completions/min_length": 226.0, - "completions/min_terminated_length": 226.0, + "completions/max_terminated_length": 1877.0, + "completions/mean_length": 905.70703125, + "completions/mean_terminated_length": 733.7213745117188, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.37176751728258084, - "grad_norm": 1.3999513387680054, - "kl": 5.125, - "learning_rate": 8.123736385549063e-07, - "loss": 0.3134, - "num_tokens": 639254041.0, - "reward": 1.79443359375, - "reward_std": 0.5958268642425537, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.7890625, - "rewards/format_reward/std": 0.4083731174468994, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.1914980411529541, + "grad_norm": 4.76137638092041, + "kl": 7.765625, + "learning_rate": 8.126777405291217e-07, + "loss": 0.6249, + "num_tokens": 710527322.0, + "reward": 0.97314453125, + "reward_std": 0.27601850032806396, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89697265625, + "rewards/tag_count_reward/std": 0.21953432261943817, "step": 1089 }, { @@ -31596,27 +31596,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.166015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 931.2734375, - "completions/mean_terminated_length": 888.2352905273438, - "completions/min_length": 85.0, - "completions/min_terminated_length": 85.0, + "completions/max_terminated_length": 1637.0, + "completions/mean_length": 989.146484375, + "completions/mean_terminated_length": 778.36767578125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.37210890159597165, - "grad_norm": 2.167673349380493, - "kl": 6.609375, - "learning_rate": 8.119378998816932e-07, - "loss": 0.3793, - "num_tokens": 639805157.0, - "reward": 1.75927734375, - "reward_std": 0.6634529232978821, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310423493385315, - "rewards/format_reward/mean": 0.74609375, - "rewards/format_reward/std": 0.43567025661468506, - "rewards/tag_count_reward/mean": 0.88818359375, - "rewards/tag_count_reward/std": 0.22734324634075165, + "grad_norm": 7.1717305183410645, + "kl": 10.453125, + "learning_rate": 8.122420965964274e-07, + "loss": 0.7534, + "num_tokens": 711108069.0, + "reward": 1.00341796875, + "reward_std": 0.34246936440467834, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.85888671875, + "rewards/tag_count_reward/std": 0.2540939450263977, "step": 1090 }, { @@ -31625,27 +31625,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.119140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 860.47265625, - "completions/mean_terminated_length": 839.224609375, - "completions/min_length": 265.0, - "completions/min_terminated_length": 265.0, + "completions/max_terminated_length": 1646.0, + "completions/mean_length": 926.458984375, + "completions/mean_terminated_length": 774.7649536132812, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, "epoch": 0.37245028590936247, - "grad_norm": 0.7005177140235901, - "kl": 4.20703125, - "learning_rate": 8.115017894355401e-07, - "loss": 0.2119, - "num_tokens": 640327143.0, - "reward": 1.7822265625, - "reward_std": 0.571142852306366, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.787109375, - "rewards/format_reward/std": 0.409751296043396, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.1916109025478363, + "grad_norm": 5.09678316116333, + "kl": 7.7890625, + "learning_rate": 8.11806080176587e-07, + "loss": 0.5947, + "num_tokens": 711663840.0, + "reward": 0.9853515625, + "reward_std": 0.2697702944278717, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9052734375, + "rewards/tag_count_reward/std": 0.20105281472206116, "step": 1091 }, { @@ -31654,27 +31654,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.115234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 795.486328125, - "completions/mean_terminated_length": 767.9860229492188, - "completions/min_length": 211.0, - "completions/min_terminated_length": 211.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 841.29296875, + "completions/mean_terminated_length": 684.1280517578125, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, "epoch": 0.3727916702227533, - "grad_norm": 0.9756630659103394, - "kl": 4.40234375, - "learning_rate": 8.110653078354264e-07, - "loss": 0.2466, - "num_tokens": 640818288.0, - "reward": 1.75048828125, - "reward_std": 0.5791925191879272, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.7890625, - "rewards/format_reward/std": 0.4083731174468994, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.19634176790714264, + "grad_norm": 6.808550834655762, + "kl": 9.1484375, + "learning_rate": 8.113696918889159e-07, + "loss": 0.6749, + "num_tokens": 712178438.0, + "reward": 1.00927734375, + "reward_std": 0.2739403545856476, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90771484375, + "rewards/tag_count_reward/std": 0.21818961203098297, "step": 1092 }, { @@ -31683,27 +31683,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.123046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 886.421875, - "completions/mean_terminated_length": 824.27978515625, - "completions/min_length": 85.0, - "completions/min_terminated_length": 85.0, + "completions/max_terminated_length": 1799.0, + "completions/mean_length": 913.9765625, + "completions/mean_terminated_length": 754.8596801757812, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.37313305453614404, - "grad_norm": 1.5486931800842285, - "kl": 6.171875, - "learning_rate": 8.106284557008577e-07, - "loss": 0.3443, - "num_tokens": 641345560.0, - "reward": 1.70458984375, - "reward_std": 0.6492677927017212, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.744140625, - "rewards/format_reward/std": 0.43676990270614624, - "rewards/tag_count_reward/mean": 0.88818359375, - "rewards/tag_count_reward/std": 0.22626470029354095, + "grad_norm": 5.173513412475586, + "kl": 8.1640625, + "learning_rate": 8.109329323532572e-07, + "loss": 0.581, + "num_tokens": 712719818.0, + "reward": 0.974609375, + "reward_std": 0.2698734402656555, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.892578125, + "rewards/tag_count_reward/std": 0.22269386053085327, "step": 1093 }, { @@ -31712,26 +31712,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 866.28125, - "completions/mean_terminated_length": 825.697021484375, - "completions/min_length": 187.0, - "completions/min_terminated_length": 187.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 869.73046875, + "completions/mean_terminated_length": 753.4205932617188, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, "epoch": 0.37347443884953485, - "grad_norm": 1.7973084449768066, - "kl": 5.1953125, - "learning_rate": 8.101912336518656e-07, - "loss": 0.3019, - "num_tokens": 641866584.0, - "reward": 1.755859375, - "reward_std": 0.6487219929695129, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.765625, - "rewards/format_reward/std": 0.42402184009552, - "rewards/tag_count_reward/mean": 0.904296875, + "grad_norm": 4.9021124839782715, + "kl": 5.953125, + "learning_rate": 8.104958021899817e-07, + "loss": 0.4568, + "num_tokens": 713242608.0, + "reward": 1.001953125, + "reward_std": 0.2680632770061493, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.908203125, "rewards/tag_count_reward/std": 0.20718760788440704, "step": 1094 }, @@ -31741,27 +31741,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1851.0, - "completions/mean_length": 911.845703125, - "completions/mean_terminated_length": 865.6605224609375, - "completions/min_length": 292.0, - "completions/min_terminated_length": 292.0, + "completions/max_terminated_length": 1928.0, + "completions/mean_length": 983.716796875, + "completions/mean_terminated_length": 786.6273193359375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.37381582316292566, - "grad_norm": 1.6846357583999634, - "kl": 4.26171875, - "learning_rate": 8.097536423090072e-07, - "loss": 0.2737, - "num_tokens": 642411369.0, - "reward": 1.79931640625, - "reward_std": 0.5785388946533203, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.779296875, - "rewards/format_reward/std": 0.4151262938976288, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.18756051361560822, + "grad_norm": 3.8479092121124268, + "kl": 9.3515625, + "learning_rate": 8.100583020199867e-07, + "loss": 0.6675, + "num_tokens": 713824191.0, + "reward": 0.95654296875, + "reward_std": 0.3160579800605774, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87255859375, + "rewards/tag_count_reward/std": 0.23666778206825256, "step": 1095 }, { @@ -31770,27 +31770,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1925.0, - "completions/mean_length": 845.501953125, - "completions/mean_terminated_length": 786.3626708984375, - "completions/min_length": 19.0, - "completions/min_terminated_length": 19.0, + "completions/max_terminated_length": 1398.0, + "completions/mean_length": 871.8359375, + "completions/mean_terminated_length": 709.7866821289062, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.3741572074763165, - "grad_norm": 1.0550159215927124, - "kl": 6.0703125, - "learning_rate": 8.09315682293363e-07, - "loss": 0.3591, - "num_tokens": 642922058.0, - "reward": 1.71044921875, - "reward_std": 0.6264170408248901, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.759765625, - "rewards/format_reward/std": 0.4276435375213623, - "rewards/tag_count_reward/mean": 0.89208984375, - "rewards/tag_count_reward/std": 0.22491775453090668, + "grad_norm": 10.697131156921387, + "kl": 7.203125, + "learning_rate": 8.096204324646946e-07, + "loss": 0.5975, + "num_tokens": 714348363.0, + "reward": 0.95654296875, + "reward_std": 0.285021036863327, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88818359375, + "rewards/tag_count_reward/std": 0.23939752578735352, "step": 1096 }, { @@ -31799,27 +31799,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.11328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 874.115234375, - "completions/mean_terminated_length": 831.3421020507812, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 1933.0, + "completions/mean_length": 903.138671875, + "completions/mean_terminated_length": 756.8788452148438, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, "epoch": 0.37449859178970724, - "grad_norm": 1.7731609344482422, - "kl": 6.234375, - "learning_rate": 8.088773542265372e-07, - "loss": 0.365, - "num_tokens": 643437077.0, - "reward": 1.77880859375, - "reward_std": 0.625165581703186, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.771484375, - "rewards/format_reward/std": 0.4202871024608612, - "rewards/tag_count_reward/mean": 0.89794921875, - "rewards/tag_count_reward/std": 0.21887609362602234, + "grad_norm": 11.861581802368164, + "kl": 6.46875, + "learning_rate": 8.091821941460532e-07, + "loss": 0.5427, + "num_tokens": 714878242.0, + "reward": 1.0185546875, + "reward_std": 0.29605746269226074, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8857421875, + "rewards/tag_count_reward/std": 0.23643602430820465, "step": 1097 }, { @@ -31828,27 +31828,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.16015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 878.9140625, - "completions/mean_terminated_length": 826.4244384765625, - "completions/min_length": 9.0, - "completions/min_terminated_length": 9.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 993.1953125, + "completions/mean_terminated_length": 792.0465087890625, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.37483997610309805, - "grad_norm": 1.2832894325256348, - "kl": 5.66796875, - "learning_rate": 8.084386587306566e-07, - "loss": 0.3312, - "num_tokens": 643963929.0, - "reward": 1.74560546875, - "reward_std": 0.6264725923538208, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.755859375, - "rewards/format_reward/std": 0.42999663949012756, - "rewards/tag_count_reward/mean": 0.89599609375, - "rewards/tag_count_reward/std": 0.21907246112823486, + "grad_norm": 5.240716934204102, + "kl": 8.90625, + "learning_rate": 8.08743587686533e-07, + "loss": 0.6579, + "num_tokens": 715463606.0, + "reward": 0.9208984375, + "reward_std": 0.3361448645591736, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8408203125, + "rewards/tag_count_reward/std": 0.2700904309749603, "step": 1098 }, { @@ -31857,27 +31857,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 907.818359375, - "completions/mean_terminated_length": 834.334716796875, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/max_terminated_length": 1874.0, + "completions/mean_length": 910.236328125, + "completions/mean_terminated_length": 753.477783203125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, "epoch": 0.37518136041648886, - "grad_norm": 1.0897924900054932, - "kl": 7.1953125, - "learning_rate": 8.079995964283688e-07, - "loss": 0.4779, - "num_tokens": 644510588.0, - "reward": 1.6796875, - "reward_std": 0.6540594100952148, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.728515625, - "rewards/format_reward/std": 0.44516023993492126, - "rewards/tag_count_reward/mean": 0.876953125, - "rewards/tag_count_reward/std": 0.23091770708560944, + "grad_norm": 3.259673833847046, + "kl": 8.578125, + "learning_rate": 8.083046137091285e-07, + "loss": 0.6051, + "num_tokens": 716011503.0, + "reward": 0.91064453125, + "reward_std": 0.30505913496017456, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.85009765625, + "rewards/tag_count_reward/std": 0.26425132155418396, "step": 1099 }, { @@ -31886,27 +31886,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.146484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1932.0, - "completions/mean_length": 857.712890625, - "completions/mean_terminated_length": 811.8397216796875, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/max_terminated_length": 1915.0, + "completions/mean_length": 949.265625, + "completions/mean_terminated_length": 760.6956176757812, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, "epoch": 0.3755227447298797, - "grad_norm": 1.4612879753112793, - "kl": 5.4609375, - "learning_rate": 8.075601679428427e-07, - "loss": 0.3507, - "num_tokens": 645027049.0, - "reward": 1.69189453125, - "reward_std": 0.6441332101821899, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.748046875, - "rewards/format_reward/std": 0.43455907702445984, - "rewards/tag_count_reward/mean": 0.89501953125, - "rewards/tag_count_reward/std": 0.2152220904827118, + "grad_norm": 4.003683090209961, + "kl": 9.96875, + "learning_rate": 8.078652728373558e-07, + "loss": 0.7118, + "num_tokens": 716574839.0, + "reward": 0.8876953125, + "reward_std": 0.31696707010269165, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8349609375, + "rewards/tag_count_reward/std": 0.276454359292984, "step": 1100 }, { @@ -31915,27 +31915,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.16796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 815.390625, - "completions/mean_terminated_length": 775.6290283203125, - "completions/min_length": 167.0, - "completions/min_terminated_length": 167.0, + "completions/max_terminated_length": 1808.0, + "completions/mean_length": 940.173828125, + "completions/mean_terminated_length": 716.5281982421875, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, "epoch": 0.37586412904327043, - "grad_norm": 1.3878822326660156, - "kl": 5.3828125, - "learning_rate": 8.071203738977667e-07, - "loss": 0.3169, - "num_tokens": 645519089.0, - "reward": 1.75, - "reward_std": 0.6552772521972656, - "rewards/accuracy_reward/mean": 0.12708333134651184, - "rewards/accuracy_reward/std": 0.3334137797355652, - "rewards/format_reward/mean": 0.740234375, - "rewards/format_reward/std": 0.4389347732067108, - "rewards/tag_count_reward/mean": 0.890625, - "rewards/tag_count_reward/std": 0.2244802713394165, + "grad_norm": 5.373190879821777, + "kl": 10.734375, + "learning_rate": 8.07425565695252e-07, + "loss": 0.737, + "num_tokens": 717130768.0, + "reward": 0.90576171875, + "reward_std": 0.355996310710907, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29178470373153687, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.81787109375, + "rewards/tag_count_reward/std": 0.2924489378929138, "step": 1101 }, { @@ -31944,27 +31944,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.134765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 888.880859375, - "completions/mean_terminated_length": 839.3055419921875, - "completions/min_length": 65.0, - "completions/min_terminated_length": 65.0, + "completions/max_terminated_length": 1899.0, + "completions/mean_length": 993.9921875, + "completions/mean_terminated_length": 829.823974609375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.37620551335666125, - "grad_norm": 3.209505796432495, - "kl": 6.3984375, - "learning_rate": 8.066802149173479e-07, - "loss": 0.3408, - "num_tokens": 646054292.0, - "reward": 1.58984375, - "reward_std": 0.704230546951294, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.669921875, - "rewards/format_reward/std": 0.47070086002349854, - "rewards/tag_count_reward/mean": 0.8515625, - "rewards/tag_count_reward/std": 0.2451835423707962, + "grad_norm": 3.4670822620391846, + "kl": 9.4609375, + "learning_rate": 8.069854929073746e-07, + "loss": 0.6363, + "num_tokens": 717719788.0, + "reward": 0.88720703125, + "reward_std": 0.34606945514678955, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.83056640625, + "rewards/tag_count_reward/std": 0.28278782963752747, "step": 1102 }, { @@ -31973,27 +31973,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.103515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 791.171875, - "completions/mean_terminated_length": 745.3765258789062, - "completions/min_length": 101.0, - "completions/min_terminated_length": 101.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 839.98046875, + "completions/mean_terminated_length": 700.4923706054688, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, "epoch": 0.37654689767005206, - "grad_norm": 2.4412620067596436, - "kl": 5.25, - "learning_rate": 8.062396916263112e-07, - "loss": 0.3245, - "num_tokens": 646532140.0, - "reward": 1.7099609375, - "reward_std": 0.6323409676551819, + "grad_norm": 5.100794792175293, + "kl": 10.140625, + "learning_rate": 8.065450550988003e-07, + "loss": 0.7634, + "num_tokens": 718222626.0, + "reward": 0.8857421875, + "reward_std": 0.3373267650604248, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.748046875, - "rewards/format_reward/std": 0.43455907702445984, - "rewards/tag_count_reward/mean": 0.8974609375, - "rewards/tag_count_reward/std": 0.20921388268470764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8212890625, + "rewards/tag_count_reward/std": 0.28548699617385864, "step": 1103 }, { @@ -32002,27 +32002,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1930.0, - "completions/mean_length": 780.115234375, - "completions/mean_terminated_length": 752.2774658203125, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 865.447265625, + "completions/mean_terminated_length": 748.714599609375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.3768882819834429, - "grad_norm": 1.7001005411148071, - "kl": 5.140625, - "learning_rate": 8.057988046498993e-07, - "loss": 0.3435, - "num_tokens": 647011143.0, - "reward": 1.74462890625, - "reward_std": 0.6385067105293274, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.76953125, - "rewards/format_reward/std": 0.42154473066329956, - "rewards/tag_count_reward/mean": 0.90478515625, - "rewards/tag_count_reward/std": 0.20946665108203888, + "grad_norm": 7.350658416748047, + "kl": 8.21875, + "learning_rate": 8.061042528951246e-07, + "loss": 0.6297, + "num_tokens": 718745319.0, + "reward": 0.86865234375, + "reward_std": 0.3269660174846649, + "rewards/accuracy_reward/mean": 0.041015625, + "rewards/accuracy_reward/std": 0.19852031767368317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.82763671875, + "rewards/tag_count_reward/std": 0.28746289014816284, "step": 1104 }, { @@ -32031,27 +32031,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1971.0, - "completions/mean_length": 832.859375, - "completions/mean_terminated_length": 801.202392578125, - "completions/min_length": 138.0, - "completions/min_terminated_length": 138.0, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 904.81640625, + "completions/mean_terminated_length": 791.969970703125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.3772296662968337, - "grad_norm": 2.0844802856445312, - "kl": 4.96875, - "learning_rate": 8.053575546138706e-07, - "loss": 0.3106, - "num_tokens": 647509087.0, - "reward": 1.6865234375, - "reward_std": 0.628883957862854, - "rewards/accuracy_reward/mean": 0.04233871027827263, - "rewards/accuracy_reward/std": 0.2015640139579773, - "rewards/format_reward/mean": 0.75390625, - "rewards/format_reward/std": 0.4311550557613373, - "rewards/tag_count_reward/mean": 0.8916015625, - "rewards/tag_count_reward/std": 0.22000661492347717, + "grad_norm": 3.2291688919067383, + "kl": 8.3671875, + "learning_rate": 8.056630869224602e-07, + "loss": 0.5834, + "num_tokens": 719280105.0, + "reward": 0.87939453125, + "reward_std": 0.3762778639793396, + "rewards/accuracy_reward/mean": 0.0786290317773819, + "rewards/accuracy_reward/std": 0.26943066716194153, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.80322265625, + "rewards/tag_count_reward/std": 0.2995828092098236, "step": 1105 }, { @@ -32060,27 +32060,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 808.99609375, - "completions/mean_terminated_length": 763.8502197265625, - "completions/min_length": 15.0, - "completions/min_terminated_length": 15.0, + "completions/max_terminated_length": 1900.0, + "completions/mean_length": 865.216796875, + "completions/mean_terminated_length": 748.4613647460938, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, "epoch": 0.37757105061022445, - "grad_norm": 1.9999911785125732, - "kl": 6.65625, - "learning_rate": 8.049159421444986e-07, - "loss": 0.4405, - "num_tokens": 647994125.0, - "reward": 1.72216796875, - "reward_std": 0.6616029739379883, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.763671875, - "rewards/format_reward/std": 0.42524150013923645, - "rewards/tag_count_reward/mean": 0.89208984375, - "rewards/tag_count_reward/std": 0.22975978255271912, + "grad_norm": 7.395659923553467, + "kl": 7.5859375, + "learning_rate": 8.052215578074369e-07, + "loss": 0.5442, + "num_tokens": 719793928.0, + "reward": 0.890625, + "reward_std": 0.3740348219871521, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.296753466129303, "step": 1106 }, { @@ -32089,27 +32089,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.095703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1937.0, - "completions/mean_length": 903.98828125, - "completions/mean_terminated_length": 867.0846557617188, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 961.396484375, + "completions/mean_terminated_length": 846.3995361328125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.37791243492361526, - "grad_norm": 1.6477216482162476, - "kl": 5.2890625, - "learning_rate": 8.044739678685713e-07, - "loss": 0.2957, - "num_tokens": 648545143.0, - "reward": 1.7177734375, - "reward_std": 0.5774356126785278, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.759765625, - "rewards/format_reward/std": 0.4276435375213623, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.193101167678833, + "grad_norm": 4.569037914276123, + "kl": 7.8984375, + "learning_rate": 8.047796661771999e-07, + "loss": 0.5551, + "num_tokens": 720374339.0, + "reward": 0.84521484375, + "reward_std": 0.32073554396629333, + "rewards/accuracy_reward/mean": 0.029296875, + "rewards/accuracy_reward/std": 0.16880230605602264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.81591796875, + "rewards/tag_count_reward/std": 0.28312888741493225, "step": 1107 }, { @@ -32118,27 +32118,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1948.0, - "completions/mean_length": 830.173828125, - "completions/mean_terminated_length": 790.8890991210938, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 902.169921875, + "completions/mean_terminated_length": 744.2999877929688, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.3782538192370061, - "grad_norm": 2.0305397510528564, - "kl": 6.46875, - "learning_rate": 8.040316324133907e-07, - "loss": 0.3835, - "num_tokens": 649045344.0, - "reward": 1.8037109375, - "reward_std": 0.6847227215766907, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, - "rewards/format_reward/mean": 0.775390625, - "rewards/format_reward/std": 0.41773295402526855, - "rewards/tag_count_reward/mean": 0.9052734375, - "rewards/tag_count_reward/std": 0.20997978746891022, + "grad_norm": 6.202198028564453, + "kl": 8.6484375, + "learning_rate": 8.043374126594095e-07, + "loss": 0.6576, + "num_tokens": 720911402.0, + "reward": 0.91650390625, + "reward_std": 0.3903951644897461, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.80712890625, + "rewards/tag_count_reward/std": 0.2914015054702759, "step": 1108 }, { @@ -32147,27 +32147,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.00390625, + "completions/clipped_ratio": 0.1796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 719.869140625, - "completions/mean_terminated_length": 714.6608276367188, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 931.578125, + "completions/mean_terminated_length": 687.0286254882812, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, "epoch": 0.3785952035503969, - "grad_norm": 1.0011018514633179, - "kl": 4.45703125, - "learning_rate": 8.035889364067709e-07, - "loss": 0.2692, - "num_tokens": 649489821.0, - "reward": 1.90478515625, - "reward_std": 0.5357630252838135, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.93798828125, - "rewards/tag_count_reward/std": 0.16971616446971893, + "grad_norm": 7.712705612182617, + "kl": 11.3125, + "learning_rate": 8.038947978822401e-07, + "loss": 0.8177, + "num_tokens": 721464274.0, + "reward": 0.92041015625, + "reward_std": 0.4082567095756531, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.79345703125, + "rewards/tag_count_reward/std": 0.2962438762187958, "step": 1109 }, { @@ -32176,27 +32176,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.19921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 806.626953125, - "completions/mean_terminated_length": 776.8340454101562, - "completions/min_length": 229.0, - "completions/min_terminated_length": 229.0, + "completions/max_terminated_length": 1955.0, + "completions/mean_length": 1026.70703125, + "completions/mean_terminated_length": 772.6292724609375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.37893658786378764, - "grad_norm": 2.0366158485412598, - "kl": 5.234375, - "learning_rate": 8.031458804770379e-07, - "loss": 0.3228, - "num_tokens": 649982926.0, - "reward": 1.8173828125, - "reward_std": 0.5892714262008667, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.19016924500465393, + "grad_norm": 8.70825481414795, + "kl": 11.125, + "learning_rate": 8.034518224743791e-07, + "loss": 0.7986, + "num_tokens": 722070060.0, + "reward": 0.85400390625, + "reward_std": 0.36291998624801636, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.78564453125, + "rewards/tag_count_reward/std": 0.29687538743019104, "step": 1110 }, { @@ -32205,27 +32205,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.24609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 872.439453125, - "completions/mean_terminated_length": 809.5493774414062, - "completions/min_length": 35.0, - "completions/min_terminated_length": 35.0, + "completions/max_terminated_length": 1817.0, + "completions/mean_length": 1098.173828125, + "completions/mean_terminated_length": 788.126953125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, "epoch": 0.37927797217717846, - "grad_norm": 1.166548490524292, - "kl": 5.96875, - "learning_rate": 8.027024652530285e-07, - "loss": 0.3623, - "num_tokens": 650507407.0, - "reward": 1.775390625, - "reward_std": 0.6269536018371582, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.779296875, - "rewards/format_reward/std": 0.4151262938976288, - "rewards/tag_count_reward/mean": 0.90625, - "rewards/tag_count_reward/std": 0.20925270020961761, + "grad_norm": 5.598191738128662, + "kl": 9.875, + "learning_rate": 8.030084870650261e-07, + "loss": 0.7457, + "num_tokens": 722710117.0, + "reward": 0.86572265625, + "reward_std": 0.38635337352752686, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.77587890625, + "rewards/tag_count_reward/std": 0.30518868565559387, "step": 1111 }, { @@ -32234,27 +32234,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.29296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 744.90625, - "completions/mean_terminated_length": 724.2222900390625, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/max_terminated_length": 1639.0, + "completions/mean_length": 1046.9296875, + "completions/mean_terminated_length": 632.12158203125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.37961935649056927, - "grad_norm": 0.8393130898475647, - "kl": 5.328125, - "learning_rate": 8.022586913640896e-07, - "loss": 0.334, - "num_tokens": 650965071.0, - "reward": 1.85791015625, - "reward_std": 0.4956350326538086, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.18965290486812592, + "grad_norm": 10.496648788452148, + "kl": 10.5, + "learning_rate": 8.025647922838923e-07, + "loss": 0.8676, + "num_tokens": 723322417.0, + "reward": 0.85791015625, + "reward_std": 0.3202684223651886, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.77392578125, + "rewards/tag_count_reward/std": 0.29433465003967285, "step": 1112 }, { @@ -32263,27 +32263,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.330078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1825.0, - "completions/mean_length": 801.142578125, - "completions/mean_terminated_length": 758.3212280273438, - "completions/min_length": 254.0, - "completions/min_terminated_length": 254.0, + "completions/max_terminated_length": 1627.0, + "completions/mean_length": 1136.703125, + "completions/mean_terminated_length": 687.6968383789062, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.3799607408039601, - "grad_norm": 1.9378595352172852, - "kl": 6.4921875, - "learning_rate": 8.018145594400772e-07, - "loss": 0.4415, - "num_tokens": 651452248.0, - "reward": 1.86279296875, - "reward_std": 0.6022834777832031, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.19962850213050842, + "grad_norm": 5.600003719329834, + "kl": 11.09375, + "learning_rate": 8.021207387611991e-07, + "loss": 0.8709, + "num_tokens": 723981401.0, + "reward": 0.85888671875, + "reward_std": 0.3840753436088562, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.77099609375, + "rewards/tag_count_reward/std": 0.29414305090904236, "step": 1113 }, { @@ -32292,27 +32292,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.380859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1958.0, - "completions/mean_length": 818.1953125, - "completions/mean_terminated_length": 786.1563110351562, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 1939.0, + "completions/mean_length": 1229.384765625, + "completions/mean_terminated_length": 725.8201904296875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, "epoch": 0.38030212511735084, - "grad_norm": 3.2051782608032227, - "kl": 6.484375, - "learning_rate": 8.01370070111355e-07, - "loss": 0.3638, - "num_tokens": 651938412.0, - "reward": 1.802734375, - "reward_std": 0.5860434770584106, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.79296875, - "rewards/format_reward/std": 0.40557438135147095, - "rewards/tag_count_reward/mean": 0.91015625, - "rewards/tag_count_reward/std": 0.1984144002199173, + "grad_norm": 10.896602630615234, + "kl": 13.609375, + "learning_rate": 8.016763271276776e-07, + "loss": 0.9434, + "num_tokens": 724678094.0, + "reward": 0.828125, + "reward_std": 0.40060293674468994, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.732421875, + "rewards/tag_count_reward/std": 0.3083692193031311, "step": 1114 }, { @@ -32321,27 +32321,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.32421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 722.341796875, - "completions/mean_terminated_length": 698.6222534179688, - "completions/min_length": 198.0, - "completions/min_terminated_length": 198.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 1124.087890625, + "completions/mean_terminated_length": 680.8236694335938, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, "epoch": 0.38064350943074166, - "grad_norm": 1.0053163766860962, - "kl": 4.33203125, - "learning_rate": 8.009252240087947e-07, - "loss": 0.244, - "num_tokens": 652384875.0, - "reward": 1.8671875, - "reward_std": 0.5169427990913391, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.9375, - "rewards/tag_count_reward/std": 0.16478058695793152, + "grad_norm": 7.6408209800720215, + "kl": 12.453125, + "learning_rate": 8.012315580145675e-07, + "loss": 0.9452, + "num_tokens": 725330251.0, + "reward": 0.82958984375, + "reward_std": 0.3632813096046448, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.77099609375, + "rewards/tag_count_reward/std": 0.2933102548122406, "step": 1115 }, { @@ -32350,27 +32350,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.322265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1942.0, - "completions/mean_length": 739.279296875, - "completions/mean_terminated_length": 705.1843872070312, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/max_terminated_length": 1887.0, + "completions/mean_length": 1106.79296875, + "completions/mean_terminated_length": 659.2449340820312, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, "epoch": 0.38098489374413247, - "grad_norm": 1.4230599403381348, - "kl": 5.28125, - "learning_rate": 8.004800217637736e-07, - "loss": 0.3337, - "num_tokens": 652837802.0, - "reward": 1.7919921875, - "reward_std": 0.5027107000350952, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.18952499330043793, + "grad_norm": 11.412835121154785, + "kl": 13.1875, + "learning_rate": 8.00786432053616e-07, + "loss": 1.0127, + "num_tokens": 725971345.0, + "reward": 0.81298828125, + "reward_std": 0.31443271040916443, + "rewards/accuracy_reward/mean": 0.037109375, + "rewards/accuracy_reward/std": 0.18921469151973724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.77587890625, + "rewards/tag_count_reward/std": 0.2945845425128937, "step": 1116 }, { @@ -32379,27 +32379,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.294921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 814.55859375, - "completions/mean_terminated_length": 769.6154174804688, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 1719.0, + "completions/mean_length": 1110.19921875, + "completions/mean_terminated_length": 717.9334716796875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.3813262780575233, - "grad_norm": 2.010937452316284, - "kl": 5.62890625, - "learning_rate": 8.000344640081752e-07, - "loss": 0.3261, - "num_tokens": 653342456.0, - "reward": 1.69970703125, - "reward_std": 0.6149077415466309, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.74609375, - "rewards/format_reward/std": 0.43567025661468506, - "rewards/tag_count_reward/mean": 0.89501953125, - "rewards/tag_count_reward/std": 0.2135103940963745, + "grad_norm": 10.525016784667969, + "kl": 13.125, + "learning_rate": 8.003409498770777e-07, + "loss": 0.9543, + "num_tokens": 726627367.0, + "reward": 0.82666015625, + "reward_std": 0.33735811710357666, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.77587890625, + "rewards/tag_count_reward/std": 0.2958274781703949, "step": 1117 }, { @@ -32408,27 +32408,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.26171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 747.55859375, - "completions/mean_terminated_length": 713.6793823242188, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/max_terminated_length": 1848.0, + "completions/mean_length": 1032.025390625, + "completions/mean_terminated_length": 671.8650512695312, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.38166766237091404, - "grad_norm": 1.4754900932312012, - "kl": 5.3828125, - "learning_rate": 7.99588551374387e-07, - "loss": 0.366, - "num_tokens": 653798374.0, - "reward": 1.869140625, - "reward_std": 0.5730654001235962, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.18268859386444092, + "grad_norm": 12.7598876953125, + "kl": 13.640625, + "learning_rate": 7.998951121177129e-07, + "loss": 0.9398, + "num_tokens": 727228932.0, + "reward": 0.8720703125, + "reward_std": 0.37326380610466003, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.7919921875, + "rewards/tag_count_reward/std": 0.295422226190567, "step": 1118 }, { @@ -32437,27 +32437,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.3203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 774.439453125, - "completions/mean_terminated_length": 743.8740234375, - "completions/min_length": 35.0, - "completions/min_terminated_length": 35.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1146.90625, + "completions/mean_terminated_length": 722.2528686523438, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.38200904668430485, - "grad_norm": 1.3872768878936768, - "kl": 5.78125, - "learning_rate": 7.991422844953004e-07, - "loss": 0.378, - "num_tokens": 654269255.0, - "reward": 1.75732421875, - "reward_std": 0.5963304042816162, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.794921875, - "rewards/format_reward/std": 0.4041535556316376, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.20034043490886688, + "grad_norm": 14.74610710144043, + "kl": 14.171875, + "learning_rate": 7.994489194087868e-07, + "loss": 0.9658, + "num_tokens": 727890516.0, + "reward": 0.814453125, + "reward_std": 0.3475879430770874, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.767578125, + "rewards/tag_count_reward/std": 0.3043770492076874, "step": 1119 }, { @@ -32466,27 +32466,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.232421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 711.677734375, - "completions/mean_terminated_length": 676.8637084960938, - "completions/min_length": 99.0, - "completions/min_terminated_length": 99.0, + "completions/max_terminated_length": 1715.0, + "completions/mean_length": 993.740234375, + "completions/mean_terminated_length": 674.511474609375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.38235043099769567, - "grad_norm": 1.223705768585205, - "kl": 5.46484375, - "learning_rate": 7.986956640043096e-07, - "loss": 0.342, - "num_tokens": 654710114.0, - "reward": 1.85205078125, - "reward_std": 0.6124395132064819, - "rewards/accuracy_reward/mean": 0.119140625, - "rewards/accuracy_reward/std": 0.32427072525024414, - "rewards/format_reward/mean": 0.814453125, - "rewards/format_reward/std": 0.38912075757980347, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.19362683594226837, + "grad_norm": 17.21853256225586, + "kl": 13.0, + "learning_rate": 7.990023723840689e-07, + "loss": 0.9065, + "num_tokens": 728475791.0, + "reward": 0.9208984375, + "reward_std": 0.4287932515144348, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7880859375, + "rewards/tag_count_reward/std": 0.29842156171798706, "step": 1120 }, { @@ -32495,27 +32495,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.208984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 857.865234375, - "completions/mean_terminated_length": 814.5, - "completions/min_length": 173.0, - "completions/min_terminated_length": 173.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1068.126953125, + "completions/mean_terminated_length": 809.2469482421875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.3826918153110865, - "grad_norm": 0.999646008014679, - "kl": 5.953125, - "learning_rate": 7.98248690535311e-07, - "loss": 0.3592, - "num_tokens": 655231117.0, - "reward": 1.7109375, - "reward_std": 0.6151527166366577, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.751953125, - "rewards/format_reward/std": 0.4323015511035919, - "rewards/tag_count_reward/mean": 0.892578125, - "rewards/tag_count_reward/std": 0.21315263211727142, + "grad_norm": 10.721884727478027, + "kl": 10.4375, + "learning_rate": 7.985554716778323e-07, + "loss": 0.7515, + "num_tokens": 729104448.0, + "reward": 0.88525390625, + "reward_std": 0.3589593768119812, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.80517578125, + "rewards/tag_count_reward/std": 0.28882914781570435, "step": 1121 }, { @@ -32524,27 +32524,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.16015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 786.4609375, - "completions/mean_terminated_length": 756.1840209960938, - "completions/min_length": 190.0, - "completions/min_terminated_length": 190.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 980.689453125, + "completions/mean_terminated_length": 777.1558227539062, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, "epoch": 0.38303319962447724, - "grad_norm": 1.6539634466171265, - "kl": 4.64453125, - "learning_rate": 7.978013647227015e-07, - "loss": 0.2793, - "num_tokens": 655705001.0, - "reward": 1.8173828125, - "reward_std": 0.6110129356384277, - "rewards/accuracy_reward/mean": 0.12298387289047241, - "rewards/accuracy_reward/std": 0.32875028252601624, - "rewards/format_reward/mean": 0.791015625, - "rewards/format_reward/std": 0.40698084235191345, - "rewards/tag_count_reward/mean": 0.9072265625, - "rewards/tag_count_reward/std": 0.2049688994884491, + "grad_norm": 9.72012996673584, + "kl": 9.03125, + "learning_rate": 7.981082179248519e-07, + "loss": 0.6839, + "num_tokens": 729677777.0, + "reward": 0.92578125, + "reward_std": 0.32583460211753845, + "rewards/accuracy_reward/mean": 0.09879032522439957, + "rewards/accuracy_reward/std": 0.2986815273761749, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.830078125, + "rewards/tag_count_reward/std": 0.2721293866634369, "step": 1122 }, { @@ -32553,27 +32553,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.16015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 886.181640625, - "completions/mean_terminated_length": 816.4244384765625, - "completions/min_length": 83.0, - "completions/min_terminated_length": 83.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 1065.802734375, + "completions/mean_terminated_length": 878.5, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, "epoch": 0.38337458393786805, - "grad_norm": 1.147435188293457, - "kl": 6.3125, - "learning_rate": 7.973536872013783e-07, - "loss": 0.3892, - "num_tokens": 656237638.0, - "reward": 1.67529296875, - "reward_std": 0.6209849119186401, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.736328125, - "rewards/format_reward/std": 0.4410543739795685, - "rewards/tag_count_reward/mean": 0.88427734375, - "rewards/tag_count_reward/std": 0.22537609934806824, + "grad_norm": 13.961073875427246, + "kl": 9.46875, + "learning_rate": 7.976606117604041e-07, + "loss": 0.6746, + "num_tokens": 730302380.0, + "reward": 0.83984375, + "reward_std": 0.33539026975631714, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.787109375, + "rewards/tag_count_reward/std": 0.289391428232193, "step": 1123 }, { @@ -32582,27 +32582,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 798.8828125, - "completions/mean_terminated_length": 755.98388671875, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 1922.0, + "completions/mean_length": 926.255859375, + "completions/mean_terminated_length": 815.5257568359375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.38371596825125887, - "grad_norm": 2.6214802265167236, - "kl": 6.09375, - "learning_rate": 7.969056586067376e-07, - "loss": 0.4175, - "num_tokens": 656716842.0, - "reward": 1.7373046875, - "reward_std": 0.6505454778671265, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.771484375, - "rewards/format_reward/std": 0.4202871024608612, - "rewards/tag_count_reward/mean": 0.8876953125, - "rewards/tag_count_reward/std": 0.22247707843780518, + "grad_norm": 13.550067901611328, + "kl": 8.0546875, + "learning_rate": 7.972126538202666e-07, + "loss": 0.5458, + "num_tokens": 730846799.0, + "reward": 0.8974609375, + "reward_std": 0.3296496868133545, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8251953125, + "rewards/tag_count_reward/std": 0.2625703811645508, "step": 1124 }, { @@ -32611,27 +32611,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 783.39453125, - "completions/mean_terminated_length": 737.3157958984375, - "completions/min_length": 114.0, - "completions/min_terminated_length": 114.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 904.8359375, + "completions/mean_terminated_length": 848.6146850585938, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, "epoch": 0.3840573525646497, - "grad_norm": 1.1501455307006836, - "kl": 6.0234375, - "learning_rate": 7.964572795746741e-07, - "loss": 0.3478, - "num_tokens": 657195524.0, - "reward": 1.712890625, - "reward_std": 0.6205965280532837, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.7578125, - "rewards/format_reward/std": 0.42882615327835083, - "rewards/tag_count_reward/mean": 0.896484375, - "rewards/tag_count_reward/std": 0.21790531277656555, + "grad_norm": 6.750629425048828, + "kl": 6.359375, + "learning_rate": 7.967643447407161e-07, + "loss": 0.4912, + "num_tokens": 731387659.0, + "reward": 0.93994140625, + "reward_std": 0.3083881139755249, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86181640625, + "rewards/tag_count_reward/std": 0.24193856120109558, "step": 1125 }, { @@ -32640,27 +32640,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 775.951171875, - "completions/mean_terminated_length": 729.6012573242188, - "completions/min_length": 35.0, - "completions/min_terminated_length": 35.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 923.091796875, + "completions/mean_terminated_length": 867.7683715820312, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.38439873687804044, - "grad_norm": 1.1001150608062744, - "kl": 5.9609375, - "learning_rate": 7.960085507415802e-07, - "loss": 0.3623, - "num_tokens": 657662219.0, - "reward": 1.6796875, - "reward_std": 0.6390104293823242, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.734375, - "rewards/format_reward/std": 0.44209739565849304, - "rewards/tag_count_reward/mean": 0.892578125, - "rewards/tag_count_reward/std": 0.21825583279132843, + "grad_norm": 4.424609184265137, + "kl": 5.0859375, + "learning_rate": 7.963156851585279e-07, + "loss": 0.3709, + "num_tokens": 731929690.0, + "reward": 0.95947265625, + "reward_std": 0.2986663281917572, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.87158203125, + "rewards/tag_count_reward/std": 0.23195762932300568, "step": 1126 }, { @@ -32669,27 +32669,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 785.125, - "completions/mean_terminated_length": 733.78857421875, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 959.375, + "completions/mean_terminated_length": 926.5191040039062, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.38474012119143125, - "grad_norm": 1.8485504388809204, - "kl": 5.0859375, - "learning_rate": 7.955594727443439e-07, - "loss": 0.3444, - "num_tokens": 658146091.0, - "reward": 1.81298828125, - "reward_std": 0.643386960029602, - "rewards/accuracy_reward/mean": 0.138671875, - "rewards/accuracy_reward/std": 0.34594178199768066, - "rewards/format_reward/mean": 0.771484375, - "rewards/format_reward/std": 0.4202871024608612, - "rewards/tag_count_reward/mean": 0.90283203125, - "rewards/tag_count_reward/std": 0.21089871227741241, + "grad_norm": 12.789087295532227, + "kl": 3.37109375, + "learning_rate": 7.958666757109757e-07, + "loss": 0.2937, + "num_tokens": 732502778.0, + "reward": 1.0625, + "reward_std": 0.279398113489151, + "rewards/accuracy_reward/mean": 0.162109375, + "rewards/accuracy_reward/std": 0.3689115643501282, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.900390625, + "rewards/tag_count_reward/std": 0.20413975417613983, "step": 1127 }, { @@ -32698,27 +32698,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 788.6328125, - "completions/mean_terminated_length": 748.008056640625, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 947.91796875, + "completions/mean_terminated_length": 914.7162475585938, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, "epoch": 0.38508150550482206, - "grad_norm": 1.121274709701538, - "kl": 6.5390625, - "learning_rate": 7.951100462203494e-07, - "loss": 0.3914, - "num_tokens": 658626671.0, - "reward": 1.7099609375, - "reward_std": 0.6317379474639893, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.75390625, - "rewards/format_reward/std": 0.4311550557613373, - "rewards/tag_count_reward/mean": 0.8974609375, - "rewards/tag_count_reward/std": 0.21326690912246704, + "grad_norm": 19.45659065246582, + "kl": 3.0546875, + "learning_rate": 7.954173170358298e-07, + "loss": 0.2943, + "num_tokens": 733064912.0, + "reward": 0.97509765625, + "reward_std": 0.2574610114097595, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90869140625, + "rewards/tag_count_reward/std": 0.19928355515003204, "step": 1128 }, { @@ -32727,27 +32727,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 779.080078125, - "completions/mean_terminated_length": 735.5010375976562, - "completions/min_length": 173.0, - "completions/min_terminated_length": 173.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 906.26953125, + "completions/mean_terminated_length": 885.8409423828125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.3854228898182129, - "grad_norm": 1.466565728187561, - "kl": 5.6484375, - "learning_rate": 7.946602718074756e-07, - "loss": 0.3559, - "num_tokens": 659094696.0, - "reward": 1.72265625, - "reward_std": 0.604473352432251, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.76953125, - "rewards/format_reward/std": 0.42154473066329956, - "rewards/tag_count_reward/mean": 0.91015625, - "rewards/tag_count_reward/std": 0.20147298276424408, + "grad_norm": 4.332626819610596, + "kl": 3.232421875, + "learning_rate": 7.949676097713569e-07, + "loss": 0.2374, + "num_tokens": 733598058.0, + "reward": 0.98046875, + "reward_std": 0.22441324591636658, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.18112322688102722, "step": 1129 }, { @@ -32756,27 +32756,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 848.818359375, - "completions/mean_terminated_length": 802.6024169921875, - "completions/min_length": 99.0, - "completions/min_terminated_length": 99.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 964.23828125, + "completions/mean_terminated_length": 938.22802734375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, "epoch": 0.38576427413160363, - "grad_norm": 1.2333524227142334, - "kl": 6.3125, - "learning_rate": 7.94210150144095e-07, - "loss": 0.3712, - "num_tokens": 659616603.0, - "reward": 1.7236328125, - "reward_std": 0.6640071868896484, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.7578125, - "rewards/format_reward/std": 0.42882615327835083, - "rewards/tag_count_reward/mean": 0.8876953125, - "rewards/tag_count_reward/std": 0.2284444123506546, + "grad_norm": 5.654244422912598, + "kl": 2.9296875, + "learning_rate": 7.945175545563182e-07, + "loss": 0.2452, + "num_tokens": 734179060.0, + "reward": 1.03759765625, + "reward_std": 0.2827211618423462, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91259765625, + "rewards/tag_count_reward/std": 0.19104842841625214, "step": 1130 }, { @@ -32785,27 +32785,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 793.828125, - "completions/mean_terminated_length": 750.755615234375, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, - "epoch": 0.38610565844499445, - "grad_norm": 1.7627885341644287, - "kl": 5.8515625, - "learning_rate": 7.937596818690729e-07, - "loss": 0.3843, - "num_tokens": 660099267.0, - "reward": 1.755859375, - "reward_std": 0.5984328389167786, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.767578125, - "rewards/format_reward/std": 0.42278963327407837, - "rewards/tag_count_reward/mean": 0.90234375, - "rewards/tag_count_reward/std": 0.2062724232673645, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 900.505859375, + "completions/mean_terminated_length": 868.2469482421875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.38610565844499445, + "grad_norm": 9.697778701782227, + "kl": 3.4921875, + "learning_rate": 7.940671520299697e-07, + "loss": 0.2896, + "num_tokens": 734716343.0, + "reward": 0.9912109375, + "reward_std": 0.25344809889793396, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8994140625, + "rewards/tag_count_reward/std": 0.19879689812660217, "step": 1131 }, { @@ -32814,27 +32814,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 851.26953125, - "completions/mean_terminated_length": 779.4161376953125, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 849.005859375, + "completions/mean_terminated_length": 829.9742431640625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.38644704275838526, - "grad_norm": 1.089171051979065, - "kl": 7.8046875, - "learning_rate": 7.933088676217667e-07, - "loss": 0.4961, - "num_tokens": 660611261.0, - "reward": 1.72021484375, - "reward_std": 0.6671257019042969, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.751953125, - "rewards/format_reward/std": 0.4323015511035919, - "rewards/tag_count_reward/mean": 0.88623046875, - "rewards/tag_count_reward/std": 0.22637026011943817, + "grad_norm": 2.7983529567718506, + "kl": 2.77734375, + "learning_rate": 7.936164028320608e-07, + "loss": 0.1988, + "num_tokens": 735227178.0, + "reward": 1.03955078125, + "reward_std": 0.2647428810596466, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.16524890065193176, "step": 1132 }, { @@ -32843,27 +32843,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 847.890625, - "completions/mean_terminated_length": 783.6871948242188, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/max_terminated_length": 1867.0, + "completions/mean_length": 892.388671875, + "completions/mean_terminated_length": 862.2825927734375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.3867884270717761, - "grad_norm": 1.5207698345184326, - "kl": 6.5, - "learning_rate": 7.928577080420247e-07, - "loss": 0.3852, - "num_tokens": 661132309.0, - "reward": 1.7197265625, - "reward_std": 0.6027827262878418, - "rewards/accuracy_reward/mean": 0.052419353276491165, - "rewards/accuracy_reward/std": 0.22309619188308716, - "rewards/format_reward/mean": 0.771484375, - "rewards/format_reward/std": 0.4202871024608612, - "rewards/tag_count_reward/mean": 0.8974609375, - "rewards/tag_count_reward/std": 0.2189268320798874, + "grad_norm": 2.7676846981048584, + "kl": 4.06640625, + "learning_rate": 7.931653076028325e-07, + "loss": 0.2835, + "num_tokens": 735771009.0, + "reward": 0.994140625, + "reward_std": 0.2430516928434372, + "rewards/accuracy_reward/mean": 0.08870967477560043, + "rewards/accuracy_reward/std": 0.284611314535141, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.908203125, + "rewards/tag_count_reward/std": 0.19936566054821014, "step": 1133 }, { @@ -32872,27 +32872,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 809.419921875, - "completions/mean_terminated_length": 756.446044921875, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 832.77734375, + "completions/mean_terminated_length": 796.1005859375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, "epoch": 0.38712981138516683, - "grad_norm": 1.6754188537597656, - "kl": 6.68359375, - "learning_rate": 7.924062037701853e-07, - "loss": 0.3849, - "num_tokens": 661626140.0, - "reward": 1.75048828125, - "reward_std": 0.6199695467948914, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.7578125, - "rewards/format_reward/std": 0.42882615327835083, - "rewards/tag_count_reward/mean": 0.89306640625, - "rewards/tag_count_reward/std": 0.22538457810878754, + "grad_norm": 2.502657651901245, + "kl": 3.6494140625, + "learning_rate": 7.927138669830181e-07, + "loss": 0.2675, + "num_tokens": 736276799.0, + "reward": 1.0517578125, + "reward_std": 0.26048845052719116, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9169921875, + "rewards/tag_count_reward/std": 0.19831565022468567, "step": 1134 }, { @@ -32901,27 +32901,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1951.0, - "completions/mean_length": 838.259765625, - "completions/mean_terminated_length": 783.9448852539062, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 892.1484375, + "completions/mean_terminated_length": 857.2635498046875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.38747119569855765, - "grad_norm": 4.741856098175049, - "kl": 6.609375, - "learning_rate": 7.919543554470763e-07, - "loss": 0.433, - "num_tokens": 662139985.0, - "reward": 1.736328125, - "reward_std": 0.6018377542495728, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.7734375, - "rewards/format_reward/std": 0.4190165400505066, - "rewards/tag_count_reward/mean": 0.900390625, - "rewards/tag_count_reward/std": 0.2146535962820053, + "grad_norm": 8.417322158813477, + "kl": 4.125, + "learning_rate": 7.922620816138412e-07, + "loss": 0.3387, + "num_tokens": 736818235.0, + "reward": 1.0068359375, + "reward_std": 0.2449808418750763, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9091796875, + "rewards/tag_count_reward/std": 0.19920024275779724, "step": 1135 }, { @@ -32930,27 +32930,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 834.3125, - "completions/mean_terminated_length": 797.6820678710938, - "completions/min_length": 21.0, - "completions/min_terminated_length": 21.0, + "completions/max_terminated_length": 1960.0, + "completions/mean_length": 958.462890625, + "completions/mean_terminated_length": 897.8082885742188, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, "epoch": 0.38781258001194846, - "grad_norm": 3.1156985759735107, - "kl": 5.0703125, - "learning_rate": 7.91502163714014e-07, - "loss": 0.3285, - "num_tokens": 662650705.0, - "reward": 1.7841796875, - "reward_std": 0.6189864873886108, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.79296875, - "rewards/format_reward/std": 0.40557438135147095, - "rewards/tag_count_reward/mean": 0.9130859375, - "rewards/tag_count_reward/std": 0.20575061440467834, + "grad_norm": 4.6951212882995605, + "kl": 6.53125, + "learning_rate": 7.918099521370152e-07, + "loss": 0.447, + "num_tokens": 737392520.0, + "reward": 0.958984375, + "reward_std": 0.3053321838378906, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88671875, + "rewards/tag_count_reward/std": 0.23221340775489807, "step": 1136 }, { @@ -32959,27 +32959,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 784.7890625, - "completions/mean_terminated_length": 741.4060668945312, - "completions/min_length": 52.0, - "completions/min_terminated_length": 52.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 807.927734375, + "completions/mean_terminated_length": 767.9253540039062, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.3881539643253393, - "grad_norm": 1.924870252609253, - "kl": 5.875, - "learning_rate": 7.910496292128015e-07, - "loss": 0.3487, - "num_tokens": 663132341.0, - "reward": 1.78173828125, - "reward_std": 0.6319122314453125, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, - "rewards/format_reward/mean": 0.76171875, - "rewards/format_reward/std": 0.42644867300987244, - "rewards/tag_count_reward/mean": 0.89306640625, - "rewards/tag_count_reward/std": 0.21539516746997833, + "grad_norm": 4.111581325531006, + "kl": 4.2578125, + "learning_rate": 7.913574791947421e-07, + "loss": 0.3014, + "num_tokens": 737886003.0, + "reward": 1.08740234375, + "reward_std": 0.26663076877593994, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93310546875, + "rewards/tag_count_reward/std": 0.17428277432918549, "step": 1137 }, { @@ -32988,27 +32988,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1965.0, - "completions/mean_length": 856.783203125, - "completions/mean_terminated_length": 820.8309326171875, - "completions/min_length": 61.0, - "completions/min_terminated_length": 61.0, + "completions/max_terminated_length": 1815.0, + "completions/mean_length": 896.119140625, + "completions/mean_terminated_length": 854.1477661132812, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.38849534863873003, - "grad_norm": 2.1889028549194336, - "kl": 5.28125, - "learning_rate": 7.905967525857291e-07, - "loss": 0.3409, - "num_tokens": 663646950.0, - "reward": 1.8046875, - "reward_std": 0.621527373790741, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.78515625, - "rewards/format_reward/std": 0.4111155867576599, - "rewards/tag_count_reward/mean": 0.908203125, - "rewards/tag_count_reward/std": 0.20659643411636353, + "grad_norm": 4.895603179931641, + "kl": 5.9921875, + "learning_rate": 7.909046634297119e-07, + "loss": 0.3927, + "num_tokens": 738420752.0, + "reward": 1.03125, + "reward_std": 0.3058784008026123, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.22049468755722046, "step": 1138 }, { @@ -33017,27 +33017,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 818.861328125, - "completions/mean_terminated_length": 766.291259765625, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 865.619140625, + "completions/mean_terminated_length": 810.006103515625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, "epoch": 0.38883673295212084, - "grad_norm": 3.511672019958496, - "kl": 7.4609375, - "learning_rate": 7.901435344755721e-07, - "loss": 0.3917, - "num_tokens": 664136751.0, - "reward": 1.646484375, - "reward_std": 0.6125409007072449, - "rewards/accuracy_reward/mean": 0.03427419438958168, - "rewards/accuracy_reward/std": 0.18211629986763, - "rewards/format_reward/mean": 0.728515625, - "rewards/format_reward/std": 0.44516023993492126, - "rewards/tag_count_reward/mean": 0.884765625, - "rewards/tag_count_reward/std": 0.21250611543655396, + "grad_norm": 6.48944091796875, + "kl": 7.0625, + "learning_rate": 7.904515054851012e-07, + "loss": 0.4694, + "num_tokens": 738934493.0, + "reward": 0.97705078125, + "reward_std": 0.2511303722858429, + "rewards/accuracy_reward/mean": 0.060483869165182114, + "rewards/accuracy_reward/std": 0.2386218160390854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91845703125, + "rewards/tag_count_reward/std": 0.19613726437091827, "step": 1139 }, { @@ -33046,27 +33046,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 795.7109375, - "completions/mean_terminated_length": 747.4482421875, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 813.705078125, + "completions/mean_terminated_length": 753.0020141601562, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.38917811726551166, - "grad_norm": 4.841424942016602, - "kl": 7.9375, - "learning_rate": 7.896899755255906e-07, - "loss": 0.4211, - "num_tokens": 664622091.0, - "reward": 1.669921875, - "reward_std": 0.6799638867378235, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.732421875, - "rewards/format_reward/std": 0.4431293308734894, - "rewards/tag_count_reward/mean": 0.865234375, - "rewards/tag_count_reward/std": 0.23177681863307953, + "grad_norm": 9.830772399902344, + "kl": 7.9765625, + "learning_rate": 7.899980060045732e-07, + "loss": 0.5002, + "num_tokens": 739429046.0, + "reward": 0.98583984375, + "reward_std": 0.30532723665237427, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.89599609375, + "rewards/tag_count_reward/std": 0.2223970592021942, "step": 1140 }, { @@ -33075,27 +33075,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 779.025390625, - "completions/mean_terminated_length": 732.7874755859375, - "completions/min_length": 176.0, - "completions/min_terminated_length": 176.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 819.876953125, + "completions/mean_terminated_length": 762.1124267578125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.38951950157890247, - "grad_norm": 1.623910665512085, - "kl": 7.03125, - "learning_rate": 7.892360763795291e-07, - "loss": 0.4307, - "num_tokens": 665096840.0, - "reward": 1.64697265625, - "reward_std": 0.6152798533439636, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.712890625, - "rewards/format_reward/std": 0.45285552740097046, - "rewards/tag_count_reward/mean": 0.88134765625, - "rewards/tag_count_reward/std": 0.2160595804452896, + "grad_norm": 2.7417662143707275, + "kl": 6.7890625, + "learning_rate": 7.895441656322757e-07, + "loss": 0.4905, + "num_tokens": 739924711.0, + "reward": 0.974609375, + "reward_std": 0.25067970156669617, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.20675364136695862, "step": 1141 }, { @@ -33104,27 +33104,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 815.537109375, - "completions/mean_terminated_length": 749.6028442382812, - "completions/min_length": 10.0, - "completions/min_terminated_length": 10.0, + "completions/max_terminated_length": 1878.0, + "completions/mean_length": 830.083984375, + "completions/mean_terminated_length": 759.6259765625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.38986088589229323, - "grad_norm": 2.9458398818969727, - "kl": 7.3125, - "learning_rate": 7.887818376816136e-07, - "loss": 0.426, - "num_tokens": 665586011.0, - "reward": 1.67724609375, - "reward_std": 0.6621624231338501, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.7109375, - "rewards/format_reward/std": 0.45377036929130554, - "rewards/tag_count_reward/mean": 0.88037109375, - "rewards/tag_count_reward/std": 0.22332319617271423, + "grad_norm": 3.4753756523132324, + "kl": 7.1640625, + "learning_rate": 7.890899850128413e-07, + "loss": 0.4958, + "num_tokens": 740421330.0, + "reward": 0.9931640625, + "reward_std": 0.3027256727218628, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8974609375, + "rewards/tag_count_reward/std": 0.22444407641887665, "step": 1142 }, { @@ -33133,27 +33133,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 778.068359375, - "completions/mean_terminated_length": 734.45458984375, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 828.74609375, + "completions/mean_terminated_length": 771.3987426757812, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, "epoch": 0.39020227020568404, - "grad_norm": 1.1650549173355103, - "kl": 5.578125, - "learning_rate": 7.883272600765535e-07, - "loss": 0.3751, - "num_tokens": 666067230.0, - "reward": 1.73388671875, - "reward_std": 0.7144436836242676, - "rewards/accuracy_reward/mean": 0.138671875, - "rewards/accuracy_reward/std": 0.34594178199768066, - "rewards/format_reward/mean": 0.71875, - "rewards/format_reward/std": 0.45004892349243164, - "rewards/tag_count_reward/mean": 0.87646484375, - "rewards/tag_count_reward/std": 0.2271834760904312, + "grad_norm": 2.9461019039154053, + "kl": 6.2421875, + "learning_rate": 7.886354647913851e-07, + "loss": 0.4587, + "num_tokens": 740928496.0, + "reward": 1.07275390625, + "reward_std": 0.3490486145019531, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90478515625, + "rewards/tag_count_reward/std": 0.21004973351955414, "step": 1143 }, { @@ -33162,27 +33162,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0703125, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 880.03515625, - "completions/mean_terminated_length": 791.7017211914062, - "completions/min_length": 2.0, - "completions/min_terminated_length": 2.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 842.294921875, + "completions/mean_terminated_length": 805.9053955078125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, "epoch": 0.39054365451907486, - "grad_norm": 3.26562237739563, - "kl": 5.5546875, - "learning_rate": 7.878723442095384e-07, - "loss": 0.3861, - "num_tokens": 666607184.0, - "reward": 1.67236328125, - "reward_std": 0.6616606116294861, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.728515625, - "rewards/format_reward/std": 0.44516023993492126, - "rewards/tag_count_reward/mean": 0.87158203125, - "rewards/tag_count_reward/std": 0.23037032783031464, + "grad_norm": 2.8186049461364746, + "kl": 6.10546875, + "learning_rate": 7.881806056135051e-07, + "loss": 0.4085, + "num_tokens": 741449127.0, + "reward": 0.998046875, + "reward_std": 0.29522985219955444, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.22849668562412262, "step": 1144 }, { @@ -33191,27 +33191,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1862.0, - "completions/mean_length": 775.494140625, - "completions/mean_terminated_length": 739.7208862304688, - "completions/min_length": 32.0, - "completions/min_terminated_length": 32.0, + "completions/max_terminated_length": 1838.0, + "completions/mean_length": 815.560546875, + "completions/mean_terminated_length": 770.6538696289062, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, "epoch": 0.39088503883246567, - "grad_norm": 2.17079496383667, - "kl": 4.13671875, - "learning_rate": 7.87417090726238e-07, - "loss": 0.2229, - "num_tokens": 667082557.0, - "reward": 1.68994140625, - "reward_std": 0.5987516045570374, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.75390625, - "rewards/format_reward/std": 0.4311550557613373, - "rewards/tag_count_reward/mean": 0.89501953125, - "rewards/tag_count_reward/std": 0.2059287577867508, + "grad_norm": 3.4153337478637695, + "kl": 5.669921875, + "learning_rate": 7.877254081252808e-07, + "loss": 0.3893, + "num_tokens": 741945014.0, + "reward": 0.94677734375, + "reward_std": 0.25482845306396484, + "rewards/accuracy_reward/mean": 0.044921875, + "rewards/accuracy_reward/std": 0.20733514428138733, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.89990234375, + "rewards/tag_count_reward/std": 0.21922071278095245, "step": 1145 }, { @@ -33220,27 +33220,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 770.181640625, - "completions/mean_terminated_length": 726.2969970703125, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 850.115234375, + "completions/mean_terminated_length": 786.0308227539062, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, "epoch": 0.39122642314585643, - "grad_norm": 2.750588893890381, - "kl": 4.109375, - "learning_rate": 7.869615002728016e-07, - "loss": 0.2714, - "num_tokens": 667559306.0, - "reward": 1.7763671875, - "reward_std": 0.6300910711288452, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.77734375, - "rewards/format_reward/std": 0.41643625497817993, - "rewards/tag_count_reward/mean": 0.9013671875, - "rewards/tag_count_reward/std": 0.20992517471313477, + "grad_norm": 4.803462505340576, + "kl": 6.1484375, + "learning_rate": 7.872698729732716e-07, + "loss": 0.4316, + "num_tokens": 742462689.0, + "reward": 0.990234375, + "reward_std": 0.33957648277282715, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.2392502874135971, "step": 1146 }, { @@ -33249,27 +33249,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 759.791015625, - "completions/mean_terminated_length": 736.7415161132812, - "completions/min_length": 40.0, - "completions/min_terminated_length": 40.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 799.23828125, + "completions/mean_terminated_length": 745.8289794921875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.39156780745924724, - "grad_norm": 1.8268723487854004, - "kl": 4.4921875, - "learning_rate": 7.865055734958566e-07, - "loss": 0.2689, - "num_tokens": 668020495.0, - "reward": 1.75439453125, - "reward_std": 0.6093716621398926, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.78125, - "rewards/format_reward/std": 0.41380295157432556, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.1958935260772705, + "grad_norm": 3.566377878189087, + "kl": 5.38671875, + "learning_rate": 7.868140008045176e-07, + "loss": 0.3758, + "num_tokens": 742944075.0, + "reward": 0.98046875, + "reward_std": 0.27180248498916626, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.21555092930793762, "step": 1147 }, { @@ -33278,27 +33278,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 779.1953125, - "completions/mean_terminated_length": 751.3373413085938, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 830.49609375, + "completions/mean_terminated_length": 773.2310791015625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, "epoch": 0.39190919177263805, - "grad_norm": 1.9029971361160278, - "kl": 4.5703125, - "learning_rate": 7.860493110425073e-07, - "loss": 0.2848, - "num_tokens": 668499283.0, - "reward": 1.79443359375, - "reward_std": 0.5710021257400513, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.796875, - "rewards/format_reward/std": 0.4027182459831238, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.18915848433971405, + "grad_norm": 4.79673957824707, + "kl": 5.58203125, + "learning_rate": 7.863577922665367e-07, + "loss": 0.4081, + "num_tokens": 743449129.0, + "reward": 0.99365234375, + "reward_std": 0.2952072024345398, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89404296875, + "rewards/tag_count_reward/std": 0.22202295064926147, "step": 1148 }, { @@ -33307,27 +33307,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 802.76171875, - "completions/mean_terminated_length": 767.7550048828125, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 848.419921875, + "completions/mean_terminated_length": 763.0941162109375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.39225057608602887, - "grad_norm": 1.7392467260360718, - "kl": 5.77734375, - "learning_rate": 7.855927135603348e-07, - "loss": 0.3378, - "num_tokens": 668987689.0, - "reward": 1.82861328125, - "reward_std": 0.6556215882301331, - "rewards/accuracy_reward/mean": 0.142578125, - "rewards/accuracy_reward/std": 0.3499840497970581, - "rewards/format_reward/mean": 0.77734375, - "rewards/format_reward/std": 0.41643625497817993, - "rewards/tag_count_reward/mean": 0.90869140625, - "rewards/tag_count_reward/std": 0.1892089992761612, + "grad_norm": 3.0114455223083496, + "kl": 6.0859375, + "learning_rate": 7.859012480073244e-07, + "loss": 0.4538, + "num_tokens": 743960912.0, + "reward": 1.07275390625, + "reward_std": 0.3607943654060364, + "rewards/accuracy_reward/mean": 0.19140625, + "rewards/accuracy_reward/std": 0.3937928080558777, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88134765625, + "rewards/tag_count_reward/std": 0.24018652737140656, "step": 1149 }, { @@ -33336,27 +33336,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 771.7578125, - "completions/mean_terminated_length": 722.5719604492188, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 798.615234375, + "completions/mean_terminated_length": 720.8527221679688, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.3925919603994196, - "grad_norm": 2.6620638370513916, - "kl": 5.5546875, - "learning_rate": 7.851357816973962e-07, - "loss": 0.2986, - "num_tokens": 669455725.0, - "reward": 1.7470703125, - "reward_std": 0.6239137053489685, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.76171875, - "rewards/format_reward/std": 0.42644867300987244, - "rewards/tag_count_reward/mean": 0.8994140625, - "rewards/tag_count_reward/std": 0.20604762434959412, + "grad_norm": 3.434847116470337, + "kl": 7.3984375, + "learning_rate": 7.854443686753542e-07, + "loss": 0.5611, + "num_tokens": 744442699.0, + "reward": 0.984375, + "reward_std": 0.31129807233810425, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.8828125, + "rewards/tag_count_reward/std": 0.2318510115146637, "step": 1150 }, { @@ -33365,27 +33365,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 842.150390625, - "completions/mean_terminated_length": 790.576416015625, - "completions/min_length": 8.0, - "completions/min_terminated_length": 8.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 840.9296875, + "completions/mean_terminated_length": 765.8008422851562, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.39293334471281044, - "grad_norm": 3.0081570148468018, - "kl": 6.8984375, - "learning_rate": 7.846785161022223e-07, - "loss": 0.3995, - "num_tokens": 669958394.0, - "reward": 1.734375, - "reward_std": 0.6649159789085388, - "rewards/accuracy_reward/mean": 0.09879032522439957, - "rewards/accuracy_reward/std": 0.2986815273761749, - "rewards/format_reward/mean": 0.74609375, - "rewards/format_reward/std": 0.43567025661468506, - "rewards/tag_count_reward/mean": 0.892578125, - "rewards/tag_count_reward/std": 0.21257804334163666, + "grad_norm": 6.170082092285156, + "kl": 8.3984375, + "learning_rate": 7.849871549195745e-07, + "loss": 0.5741, + "num_tokens": 744944743.0, + "reward": 0.96240234375, + "reward_std": 0.33735060691833496, + "rewards/accuracy_reward/mean": 0.0927419364452362, + "rewards/accuracy_reward/std": 0.2903633117675781, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87255859375, + "rewards/tag_count_reward/std": 0.23666778206825256, "step": 1151 }, { @@ -33394,27 +33394,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 847.01171875, - "completions/mean_terminated_length": 787.9466552734375, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/max_terminated_length": 1906.0, + "completions/mean_length": 823.556640625, + "completions/mean_terminated_length": 744.6423950195312, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.39327472902620125, - "grad_norm": 3.3438761234283447, - "kl": 7.7421875, - "learning_rate": 7.842209174238181e-07, - "loss": 0.4798, - "num_tokens": 670475520.0, - "reward": 1.6552734375, - "reward_std": 0.6544797420501709, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.71875, - "rewards/format_reward/std": 0.45004892349243164, - "rewards/tag_count_reward/mean": 0.8798828125, - "rewards/tag_count_reward/std": 0.23034387826919556, + "grad_norm": 4.138432025909424, + "kl": 7.7734375, + "learning_rate": 7.845296073894092e-07, + "loss": 0.5468, + "num_tokens": 745449860.0, + "reward": 0.95068359375, + "reward_std": 0.31819164752960205, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88037109375, + "rewards/tag_count_reward/std": 0.238677978515625, "step": 1152 }, { @@ -33423,27 +33423,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.091796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 768.03515625, - "completions/mean_terminated_length": 745.1331787109375, - "completions/min_length": 65.0, - "completions/min_terminated_length": 65.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 853.947265625, + "completions/mean_terminated_length": 733.258056640625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.39361611333959207, - "grad_norm": 2.016155242919922, - "kl": 5.3203125, - "learning_rate": 7.837629863116612e-07, - "loss": 0.2692, - "num_tokens": 670951250.0, - "reward": 1.78955078125, - "reward_std": 0.5713357925415039, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.796875, - "rewards/format_reward/std": 0.4027182459831238, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.18976370990276337, + "grad_norm": 4.664544105529785, + "kl": 9.0703125, + "learning_rate": 7.840717267347559e-07, + "loss": 0.6875, + "num_tokens": 745969577.0, + "reward": 0.97216796875, + "reward_std": 0.28539618849754333, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88427734375, + "rewards/tag_count_reward/std": 0.22591814398765564, "step": 1153 }, { @@ -33452,27 +33452,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.10546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 866.634765625, - "completions/mean_terminated_length": 816.1079711914062, - "completions/min_length": 30.0, - "completions/min_terminated_length": 30.0, + "completions/max_terminated_length": 1849.0, + "completions/mean_length": 882.53125, + "completions/mean_terminated_length": 745.117919921875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, "epoch": 0.3939574976529828, - "grad_norm": 2.2790541648864746, - "kl": 6.4453125, - "learning_rate": 7.833047234157012e-07, - "loss": 0.3606, - "num_tokens": 671463255.0, - "reward": 1.638671875, - "reward_std": 0.6134622097015381, - "rewards/accuracy_reward/mean": 0.0463709682226181, - "rewards/accuracy_reward/std": 0.21049949526786804, - "rewards/format_reward/mean": 0.71875, - "rewards/format_reward/std": 0.45004892349243164, - "rewards/tag_count_reward/mean": 0.875, - "rewards/tag_count_reward/std": 0.2314550280570984, + "grad_norm": 13.123332977294922, + "kl": 11.046875, + "learning_rate": 7.836135136059859e-07, + "loss": 0.6956, + "num_tokens": 746489721.0, + "reward": 0.8935546875, + "reward_std": 0.2952483296394348, + "rewards/accuracy_reward/mean": 0.05040322616696358, + "rewards/accuracy_reward/std": 0.21899643540382385, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8447265625, + "rewards/tag_count_reward/std": 0.2618269622325897, "step": 1154 }, { @@ -33481,27 +33481,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.119140625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 788.60546875, - "completions/mean_terminated_length": 737.4105224609375, - "completions/min_length": 172.0, - "completions/min_terminated_length": 172.0, + "completions/max_terminated_length": 1968.0, + "completions/mean_length": 862.625, + "completions/mean_terminated_length": 702.297119140625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.39429888196637364, - "grad_norm": 1.8915692567825317, - "kl": 5.0078125, - "learning_rate": 7.828461293863581e-07, - "loss": 0.3307, - "num_tokens": 671943085.0, - "reward": 1.77685546875, - "reward_std": 0.6183324456214905, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.77734375, - "rewards/format_reward/std": 0.41643625497817993, - "rewards/tag_count_reward/mean": 0.90576171875, - "rewards/tag_count_reward/std": 0.2045973688364029, + "grad_norm": 11.421345710754395, + "kl": 10.390625, + "learning_rate": 7.831549686539424e-07, + "loss": 0.6812, + "num_tokens": 747007449.0, + "reward": 0.95458984375, + "reward_std": 0.32081472873687744, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86279296875, + "rewards/tag_count_reward/std": 0.24500371515750885, "step": 1155 }, { @@ -33510,27 +33510,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 827.72265625, - "completions/mean_terminated_length": 780.6936645507812, - "completions/min_length": 233.0, - "completions/min_terminated_length": 233.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 920.2890625, + "completions/mean_terminated_length": 711.4537353515625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, "epoch": 0.39464026627976445, - "grad_norm": 4.631453514099121, - "kl": 4.27734375, - "learning_rate": 7.823872048745223e-07, - "loss": 0.297, - "num_tokens": 672447455.0, - "reward": 1.72265625, - "reward_std": 0.6078072190284729, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.779296875, - "rewards/format_reward/std": 0.4151262938976288, - "rewards/tag_count_reward/mean": 0.8984375, - "rewards/tag_count_reward/std": 0.21714095771312714, + "grad_norm": 11.931161880493164, + "kl": 10.015625, + "learning_rate": 7.826960925299398e-07, + "loss": 0.645, + "num_tokens": 747559213.0, + "reward": 0.90087890625, + "reward_std": 0.27657467126846313, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.85400390625, + "rewards/tag_count_reward/std": 0.2537364363670349, "step": 1156 }, { @@ -33539,27 +33539,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.154296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 812.89453125, - "completions/mean_terminated_length": 783.2520141601562, - "completions/min_length": 239.0, - "completions/min_terminated_length": 239.0, + "completions/max_terminated_length": 1851.0, + "completions/mean_length": 903.94140625, + "completions/mean_terminated_length": 695.2101440429688, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, "epoch": 0.39498165059315526, - "grad_norm": 4.364314079284668, - "kl": 4.75, - "learning_rate": 7.819279505315538e-07, - "loss": 0.3488, - "num_tokens": 672949945.0, - "reward": 1.68310546875, - "reward_std": 0.6080790758132935, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.734375, - "rewards/format_reward/std": 0.44209739565849304, - "rewards/tag_count_reward/mean": 0.89013671875, - "rewards/tag_count_reward/std": 0.2195560783147812, + "grad_norm": 9.49743938446045, + "kl": 8.96875, + "learning_rate": 7.822368858857632e-07, + "loss": 0.7157, + "num_tokens": 748108319.0, + "reward": 0.94580078125, + "reward_std": 0.29803571105003357, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86962890625, + "rewards/tag_count_reward/std": 0.24223853647708893, "step": 1157 }, { @@ -33568,27 +33568,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1900.0, - "completions/mean_length": 804.609375, - "completions/mean_terminated_length": 756.6896362304688, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, - "epoch": 0.395323034906546, - "grad_norm": 3.419736385345459, - "kl": 4.8515625, - "learning_rate": 7.814683670092795e-07, - "loss": 0.3505, - "num_tokens": 673437345.0, - "reward": 1.72314453125, - "reward_std": 0.5805853605270386, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.79296875, - "rewards/format_reward/std": 0.40557438135147095, - "rewards/tag_count_reward/mean": 0.90087890625, - "rewards/tag_count_reward/std": 0.21287843585014343, + "completions/max_terminated_length": 1914.0, + "completions/mean_length": 964.943359375, + "completions/mean_terminated_length": 688.8701171875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.395323034906546, + "grad_norm": 5.749622821807861, + "kl": 10.859375, + "learning_rate": 7.817773493736669e-07, + "loss": 0.844, + "num_tokens": 748677810.0, + "reward": 0.8759765625, + "reward_std": 0.2987990379333496, + "rewards/accuracy_reward/mean": 0.033203125, + "rewards/accuracy_reward/std": 0.17934183776378632, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.8408203125, + "rewards/tag_count_reward/std": 0.2613447606563568, "step": 1158 }, { @@ -33597,27 +33597,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.173828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 783.759765625, - "completions/mean_terminated_length": 737.6943359375, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 1817.0, + "completions/mean_length": 893.068359375, + "completions/mean_terminated_length": 650.0685424804688, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.39566441921993684, - "grad_norm": 0.9830715656280518, - "kl": 5.828125, - "learning_rate": 7.810084549599944e-07, - "loss": 0.3455, - "num_tokens": 673911862.0, - "reward": 1.68310546875, - "reward_std": 0.6620784997940063, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.24230584502220154, - "rewards/format_reward/mean": 0.73828125, - "rewards/format_reward/std": 0.44000017642974854, - "rewards/tag_count_reward/mean": 0.88427734375, - "rewards/tag_count_reward/std": 0.22537609934806824, + "grad_norm": 5.397251129150391, + "kl": 9.8671875, + "learning_rate": 7.813174836463741e-07, + "loss": 0.7496, + "num_tokens": 749208293.0, + "reward": 0.93603515625, + "reward_std": 0.33763402700424194, + "rewards/accuracy_reward/mean": 0.08870967477560043, + "rewards/accuracy_reward/std": 0.284611314535141, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.85009765625, + "rewards/tag_count_reward/std": 0.2557843327522278, "step": 1159 }, { @@ -33626,27 +33626,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.130859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 905.68359375, - "completions/mean_terminated_length": 854.3958740234375, - "completions/min_length": 182.0, - "completions/min_terminated_length": 182.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 920.4296875, + "completions/mean_terminated_length": 750.6607055664062, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.39600580353332765, - "grad_norm": 1.4845670461654663, - "kl": 7.3125, - "learning_rate": 7.805482150364598e-07, - "loss": 0.4588, - "num_tokens": 674459092.0, - "reward": 1.64453125, - "reward_std": 0.6831563711166382, - "rewards/accuracy_reward/mean": 0.08064515888690948, - "rewards/accuracy_reward/std": 0.2725643217563629, - "rewards/format_reward/mean": 0.705078125, - "rewards/format_reward/std": 0.4564536213874817, - "rewards/tag_count_reward/mean": 0.861328125, - "rewards/tag_count_reward/std": 0.24741044640541077, + "grad_norm": 3.3826115131378174, + "kl": 9.46875, + "learning_rate": 7.808572893570753e-07, + "loss": 0.6948, + "num_tokens": 749763073.0, + "reward": 0.89892578125, + "reward_std": 0.31057924032211304, + "rewards/accuracy_reward/mean": 0.060483869165182114, + "rewards/accuracy_reward/std": 0.23862183094024658, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.84033203125, + "rewards/tag_count_reward/std": 0.2612803280353546, "step": 1160 }, { @@ -33655,27 +33655,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.11328125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 775.59375, - "completions/mean_terminated_length": 726.5557861328125, - "completions/min_length": 14.0, - "completions/min_terminated_length": 14.0, + "completions/max_terminated_length": 1894.0, + "completions/mean_length": 848.728515625, + "completions/mean_terminated_length": 695.517578125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, "epoch": 0.39634718784671846, - "grad_norm": 1.3064024448394775, - "kl": 7.1484375, - "learning_rate": 7.800876478919014e-07, - "loss": 0.4384, - "num_tokens": 674928212.0, - "reward": 1.6708984375, - "reward_std": 0.6619834899902344, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.720703125, - "rewards/format_reward/std": 0.44909247756004333, - "rewards/tag_count_reward/mean": 0.8798828125, - "rewards/tag_count_reward/std": 0.22223642468452454, + "grad_norm": 3.58677077293396, + "kl": 9.765625, + "learning_rate": 7.803967671594277e-07, + "loss": 0.6828, + "num_tokens": 750269638.0, + "reward": 0.96826171875, + "reward_std": 0.3027687966823578, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87841796875, + "rewards/tag_count_reward/std": 0.24227403104305267, "step": 1161 }, { @@ -33684,27 +33684,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 840.60546875, - "completions/mean_terminated_length": 788.9653930664062, - "completions/min_length": 240.0, - "completions/min_terminated_length": 240.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 853.681640625, + "completions/mean_terminated_length": 771.40087890625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.3966885721601092, - "grad_norm": 2.027768135070801, - "kl": 6.3515625, - "learning_rate": 7.796267541800106e-07, - "loss": 0.3414, - "num_tokens": 675440698.0, - "reward": 1.67822265625, - "reward_std": 0.6405354738235474, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.71875, - "rewards/format_reward/std": 0.45004892349243164, - "rewards/tag_count_reward/mean": 0.88330078125, - "rewards/tag_count_reward/std": 0.22757430374622345, + "grad_norm": 4.255846977233887, + "kl": 6.8125, + "learning_rate": 7.799359177075546e-07, + "loss": 0.4712, + "num_tokens": 750788819.0, + "reward": 0.99609375, + "reward_std": 0.305789053440094, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.892578125, + "rewards/tag_count_reward/std": 0.22596518695354462, "step": 1162 }, { @@ -33713,27 +33713,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1988.0, - "completions/mean_length": 863.26953125, - "completions/mean_terminated_length": 805.0040283203125, - "completions/min_length": 100.0, - "completions/min_terminated_length": 100.0, + "completions/max_terminated_length": 1938.0, + "completions/mean_length": 838.93359375, + "completions/mean_terminated_length": 797.41015625, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, "epoch": 0.39702995647350003, - "grad_norm": 0.8648762702941895, - "kl": 6.640625, - "learning_rate": 7.791655345549416e-07, - "loss": 0.411, - "num_tokens": 675958212.0, - "reward": 1.68798828125, - "reward_std": 0.6656864285469055, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.7265625, - "rewards/format_reward/std": 0.4461594223976135, - "rewards/tag_count_reward/mean": 0.87744140625, - "rewards/tag_count_reward/std": 0.23666778206825256, + "grad_norm": 3.7747159004211426, + "kl": 5.8359375, + "learning_rate": 7.794747416560436e-07, + "loss": 0.3883, + "num_tokens": 751293873.0, + "reward": 1.0263671875, + "reward_std": 0.3258141875267029, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8916015625, + "rewards/tag_count_reward/std": 0.2297956496477127, "step": 1163 }, { @@ -33742,27 +33742,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 789.015625, - "completions/mean_terminated_length": 743.1417236328125, - "completions/min_length": 120.0, - "completions/min_terminated_length": 120.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 758.884765625, + "completions/mean_terminated_length": 725.3005981445312, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.39737134078689085, - "grad_norm": 2.4499430656433105, - "kl": 5.78125, - "learning_rate": 7.78703989671311e-07, - "loss": 0.3827, - "num_tokens": 676439676.0, - "reward": 1.72216796875, - "reward_std": 0.6407417058944702, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.75390625, - "rewards/format_reward/std": 0.4311550557613373, - "rewards/tag_count_reward/mean": 0.89599609375, - "rewards/tag_count_reward/std": 0.21851344406604767, + "grad_norm": 2.5672216415405273, + "kl": 5.734375, + "learning_rate": 7.790132396599467e-07, + "loss": 0.3718, + "num_tokens": 751759910.0, + "reward": 1.0029296875, + "reward_std": 0.28681328892707825, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.193694069981575, "step": 1164 }, { @@ -33771,27 +33771,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 775.470703125, - "completions/mean_terminated_length": 742.3186645507812, - "completions/min_length": 80.0, - "completions/min_terminated_length": 80.0, + "completions/max_terminated_length": 1870.0, + "completions/mean_length": 788.259765625, + "completions/mean_terminated_length": 720.8662109375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, "epoch": 0.39771272510028166, - "grad_norm": 2.001849889755249, - "kl": 4.80078125, - "learning_rate": 7.782421201841978e-07, - "loss": 0.277, - "num_tokens": 676905613.0, - "reward": 1.78955078125, - "reward_std": 0.5900582075119019, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.2004214972257614, + "grad_norm": 3.756817102432251, + "kl": 6.8125, + "learning_rate": 7.785514123747784e-07, + "loss": 0.4482, + "num_tokens": 752232395.0, + "reward": 0.9619140625, + "reward_std": 0.26857221126556396, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8974609375, + "rewards/tag_count_reward/std": 0.2260729968547821, "step": 1165 }, { @@ -33800,27 +33800,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 866.25, - "completions/mean_terminated_length": 810.6666259765625, - "completions/min_length": 226.0, - "completions/min_terminated_length": 226.0, + "completions/max_terminated_length": 1778.0, + "completions/mean_length": 814.90234375, + "completions/mean_terminated_length": 777.6860961914062, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, "epoch": 0.3980541094136724, - "grad_norm": 1.3362925052642822, - "kl": 6.625, - "learning_rate": 7.77779926749141e-07, - "loss": 0.3893, - "num_tokens": 677425021.0, - "reward": 1.66748046875, - "reward_std": 0.6531873941421509, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.740234375, - "rewards/format_reward/std": 0.4389347732067108, - "rewards/tag_count_reward/mean": 0.88818359375, - "rewards/tag_count_reward/std": 0.21912479400634766, + "grad_norm": 2.185098886489868, + "kl": 6.49609375, + "learning_rate": 7.780892604565158e-07, + "loss": 0.3951, + "num_tokens": 752725513.0, + "reward": 0.94482421875, + "reward_std": 0.31639647483825684, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88232421875, + "rewards/tag_count_reward/std": 0.24519863724708557, "step": 1166 }, { @@ -33829,27 +33829,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 846.55859375, - "completions/mean_terminated_length": 790.049072265625, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 831.220703125, + "completions/mean_terminated_length": 776.5897827148438, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.39839549372706323, - "grad_norm": 1.2076319456100464, - "kl": 6.578125, - "learning_rate": 7.773174100221398e-07, - "loss": 0.4258, - "num_tokens": 677936427.0, - "reward": 1.7744140625, - "reward_std": 0.6134333610534668, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.78515625, - "rewards/format_reward/std": 0.4111155867576599, - "rewards/tag_count_reward/mean": 0.9033203125, - "rewards/tag_count_reward/std": 0.2159915715456009, + "grad_norm": 2.6802446842193604, + "kl": 6.46875, + "learning_rate": 7.776267845615964e-07, + "loss": 0.4411, + "num_tokens": 753229066.0, + "reward": 0.96044921875, + "reward_std": 0.2605874836444855, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88037109375, + "rewards/tag_count_reward/std": 0.23918987810611725, "step": 1167 }, { @@ -33858,27 +33858,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 836.38671875, - "completions/mean_terminated_length": 799.8189086914062, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 800.5234375, + "completions/mean_terminated_length": 775.67333984375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, "epoch": 0.39873687804045405, - "grad_norm": 1.737146019935608, - "kl": 6.1015625, - "learning_rate": 7.768545706596519e-07, - "loss": 0.3958, - "num_tokens": 678439377.0, - "reward": 1.75732421875, - "reward_std": 0.5887085199356079, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.90185546875, - "rewards/tag_count_reward/std": 0.22121235728263855, + "grad_norm": 4.530117988586426, + "kl": 4.24609375, + "learning_rate": 7.771639853469186e-07, + "loss": 0.2849, + "num_tokens": 753713654.0, + "reward": 1.02880859375, + "reward_std": 0.28204211592674255, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91748046875, + "rewards/tag_count_reward/std": 0.203689306974411, "step": 1168 }, { @@ -33889,25 +33889,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1962.0, - "completions/mean_length": 825.546875, - "completions/mean_terminated_length": 788.65185546875, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/max_terminated_length": 1804.0, + "completions/mean_length": 829.73046875, + "completions/mean_terminated_length": 792.9617309570312, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.39907826235384486, - "grad_norm": 1.1133967638015747, - "kl": 6.1640625, - "learning_rate": 7.763914093185932e-07, - "loss": 0.3867, - "num_tokens": 678936105.0, - "reward": 1.8193359375, - "reward_std": 0.5887194871902466, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.814453125, - "rewards/format_reward/std": 0.38912075757980347, - "rewards/tag_count_reward/mean": 0.9130859375, - "rewards/tag_count_reward/std": 0.20335890352725983, + "grad_norm": 2.4273905754089355, + "kl": 5.3515625, + "learning_rate": 7.767008634698395e-07, + "loss": 0.3368, + "num_tokens": 754212524.0, + "reward": 1.001953125, + "reward_std": 0.30877771973609924, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.23009692132472992, "step": 1169 }, { @@ -33916,27 +33916,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1937.0, - "completions/mean_length": 848.7578125, - "completions/mean_terminated_length": 794.9142456054688, - "completions/min_length": 46.0, - "completions/min_terminated_length": 46.0, + "completions/max_terminated_length": 1913.0, + "completions/mean_length": 866.51953125, + "completions/mean_terminated_length": 810.9488525390625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.3994196466672356, - "grad_norm": 2.6224238872528076, - "kl": 7.09375, - "learning_rate": 7.759279266563365e-07, - "loss": 0.3943, - "num_tokens": 679445325.0, - "reward": 1.7685546875, - "reward_std": 0.6187150478363037, - "rewards/accuracy_reward/mean": 0.08467742055654526, - "rewards/accuracy_reward/std": 0.278682142496109, - "rewards/format_reward/mean": 0.78515625, - "rewards/format_reward/std": 0.4111155867576599, - "rewards/tag_count_reward/mean": 0.9013671875, - "rewards/tag_count_reward/std": 0.21050699055194855, + "grad_norm": 2.5649712085723877, + "kl": 6.421875, + "learning_rate": 7.76237419588175e-07, + "loss": 0.4086, + "num_tokens": 754730838.0, + "reward": 0.9580078125, + "reward_std": 0.32649922370910645, + "rewards/accuracy_reward/mean": 0.0927419364452362, + "rewards/accuracy_reward/std": 0.2903633117675781, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.8662109375, + "rewards/tag_count_reward/std": 0.25493350625038147, "step": 1170 }, { @@ -33945,27 +33945,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 787.0078125, - "completions/mean_terminated_length": 751.5582275390625, - "completions/min_length": 80.0, - "completions/min_terminated_length": 80.0, + "completions/max_terminated_length": 1867.0, + "completions/mean_length": 784.474609375, + "completions/mean_terminated_length": 743.7156982421875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, "epoch": 0.39976103098062643, - "grad_norm": 3.6764135360717773, - "kl": 8.1484375, - "learning_rate": 7.754641233307109e-07, - "loss": 0.4696, - "num_tokens": 679928801.0, - "reward": 1.724609375, - "reward_std": 0.6647849678993225, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.763671875, - "rewards/format_reward/std": 0.42524150013923645, - "rewards/tag_count_reward/mean": 0.884765625, - "rewards/tag_count_reward/std": 0.2275160402059555, + "grad_norm": 2.182196617126465, + "kl": 6.3515625, + "learning_rate": 7.757736543601977e-07, + "loss": 0.4287, + "num_tokens": 755213017.0, + "reward": 0.9736328125, + "reward_std": 0.3208127021789551, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8720703125, + "rewards/tag_count_reward/std": 0.24379020929336548, "step": 1171 }, { @@ -33974,27 +33974,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 796.189453125, - "completions/mean_terminated_length": 745.3027954101562, - "completions/min_length": 91.0, - "completions/min_terminated_length": 91.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 794.353515625, + "completions/mean_terminated_length": 751.2990112304688, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.40010241529401724, - "grad_norm": 3.195155620574951, - "kl": 8.125, - "learning_rate": 7.75e-07, - "loss": 0.4878, - "num_tokens": 680419746.0, - "reward": 1.748046875, - "reward_std": 0.5978066325187683, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.24230584502220154, - "rewards/format_reward/mean": 0.783203125, - "rewards/format_reward/std": 0.4124660789966583, - "rewards/tag_count_reward/mean": 0.904296875, - "rewards/tag_count_reward/std": 0.21358256042003632, + "grad_norm": 3.6284797191619873, + "kl": 6.7578125, + "learning_rate": 7.753095684446373e-07, + "loss": 0.4248, + "num_tokens": 755703022.0, + "reward": 0.92724609375, + "reward_std": 0.3194776177406311, + "rewards/accuracy_reward/mean": 0.07056451588869095, + "rewards/accuracy_reward/std": 0.25635460019111633, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.85888671875, + "rewards/tag_count_reward/std": 0.25361213088035583, "step": 1172 }, { @@ -34003,27 +34003,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1926.0, - "completions/mean_length": 852.396484375, - "completions/mean_terminated_length": 818.7850952148438, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 895.427734375, + "completions/mean_terminated_length": 828.7499389648438, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.40044379960740806, - "grad_norm": 2.44873046875, - "kl": 7.609375, - "learning_rate": 7.745355573229422e-07, - "loss": 0.4326, - "num_tokens": 680937485.0, - "reward": 1.74169921875, - "reward_std": 0.6479066610336304, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.775390625, - "rewards/format_reward/std": 0.41773295402526855, - "rewards/tag_count_reward/mean": 0.90380859375, - "rewards/tag_count_reward/std": 0.2096034586429596, + "grad_norm": 2.316324472427368, + "kl": 7.828125, + "learning_rate": 7.748451625006786e-07, + "loss": 0.4977, + "num_tokens": 756242793.0, + "reward": 0.91064453125, + "reward_std": 0.3320344090461731, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.85009765625, + "rewards/tag_count_reward/std": 0.26099124550819397, "step": 1173 }, { @@ -34032,27 +34032,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1924.0, - "completions/mean_length": 786.751953125, - "completions/mean_terminated_length": 753.893798828125, - "completions/min_length": 38.0, - "completions/min_terminated_length": 38.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 817.05859375, + "completions/mean_terminated_length": 795.0337524414062, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, "epoch": 0.4007851839207988, - "grad_norm": 1.5025852918624878, - "kl": 6.1875, - "learning_rate": 7.740707959587289e-07, - "loss": 0.346, - "num_tokens": 681412910.0, - "reward": 1.81494140625, - "reward_std": 0.6183757781982422, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.79296875, - "rewards/format_reward/std": 0.40557438135147095, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.2058405727148056, + "grad_norm": 5.872766971588135, + "kl": 7.78125, + "learning_rate": 7.743804371879612e-07, + "loss": 0.4797, + "num_tokens": 756733735.0, + "reward": 0.97119140625, + "reward_std": 0.3485479950904846, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.85595703125, + "rewards/tag_count_reward/std": 0.25485244393348694, "step": 1174 }, { @@ -34063,25 +34063,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 818.048828125, - "completions/mean_terminated_length": 786.0060424804688, - "completions/min_length": 17.0, - "completions/min_terminated_length": 17.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 804.693359375, + "completions/mean_terminated_length": 772.3026123046875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, "epoch": 0.40112656823418963, - "grad_norm": 2.9225752353668213, - "kl": 4.734375, - "learning_rate": 7.736057165670038e-07, - "loss": 0.2795, - "num_tokens": 681906071.0, - "reward": 1.763671875, - "reward_std": 0.5842170715332031, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.19852031767368317, + "grad_norm": 2.7498745918273926, + "kl": 5.96875, + "learning_rate": 7.739153931665782e-07, + "loss": 0.3716, + "num_tokens": 757220058.0, + "reward": 0.88720703125, + "reward_std": 0.27300506830215454, + "rewards/accuracy_reward/mean": 0.02734375, + "rewards/accuracy_reward/std": 0.16324250400066376, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.85986328125, + "rewards/tag_count_reward/std": 0.2584489583969116, "step": 1175 }, { @@ -34090,27 +34090,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 901.255859375, - "completions/mean_terminated_length": 839.9074096679688, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 882.146484375, + "completions/mean_terminated_length": 849.3714599609375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.40146795254758044, - "grad_norm": 1.8672685623168945, - "kl": 5.1953125, - "learning_rate": 7.731403198078622e-07, - "loss": 0.3463, - "num_tokens": 682441722.0, - "reward": 1.70166015625, - "reward_std": 0.608293354511261, + "grad_norm": 4.386836051940918, + "kl": 5.7734375, + "learning_rate": 7.734500310970753e-07, + "loss": 0.3415, + "num_tokens": 757745925.0, + "reward": 0.892578125, + "reward_std": 0.3077738285064697, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.76953125, - "rewards/format_reward/std": 0.42154473066329956, - "rewards/tag_count_reward/mean": 0.89111328125, - "rewards/tag_count_reward/std": 0.22225522994995117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.2587745785713196, "step": 1176 }, { @@ -34122,24 +34122,24 @@ "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, - "completions/mean_length": 847.671875, - "completions/mean_terminated_length": 823.760986328125, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/mean_length": 864.80078125, + "completions/mean_terminated_length": 841.2310791015625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, "epoch": 0.40180933686097126, - "grad_norm": 3.887122631072998, - "kl": 3.65234375, - "learning_rate": 7.726746063418493e-07, - "loss": 0.2463, - "num_tokens": 682950146.0, - "reward": 1.8203125, - "reward_std": 0.5349379777908325, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.17528937757015228, + "grad_norm": 5.974264144897461, + "kl": 5.21875, + "learning_rate": 7.729843516404501e-07, + "loss": 0.3371, + "num_tokens": 758263119.0, + "reward": 0.90771484375, + "reward_std": 0.29222816228866577, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.84912109375, + "rewards/tag_count_reward/std": 0.2632296085357666, "step": 1177 }, { @@ -34148,27 +34148,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 901.046875, - "completions/mean_terminated_length": 829.6597900390625, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 817.837890625, + "completions/mean_terminated_length": 800.7861938476562, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, "epoch": 0.402150721174362, - "grad_norm": 1.545910358428955, - "kl": 6.703125, - "learning_rate": 7.722085768299608e-07, - "loss": 0.4523, - "num_tokens": 683491658.0, - "reward": 1.6962890625, - "reward_std": 0.620068371295929, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.748046875, - "rewards/format_reward/std": 0.43455907702445984, - "rewards/tag_count_reward/mean": 0.8759765625, - "rewards/tag_count_reward/std": 0.23719459772109985, + "grad_norm": 3.2892820835113525, + "kl": 4.6953125, + "learning_rate": 7.725183554581513e-07, + "loss": 0.2951, + "num_tokens": 758762028.0, + "reward": 0.95751953125, + "reward_std": 0.32326188683509827, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86181640625, + "rewards/tag_count_reward/std": 0.25808268785476685, "step": 1178 }, { @@ -34177,27 +34177,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1986.0, - "completions/mean_length": 910.20703125, - "completions/mean_terminated_length": 839.3900756835938, - "completions/min_length": 48.0, - "completions/min_terminated_length": 48.0, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 821.431640625, + "completions/mean_terminated_length": 814.202392578125, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.4024921054877528, - "grad_norm": 1.2167710065841675, - "kl": 6.359375, - "learning_rate": 7.717422319336398e-07, - "loss": 0.3604, - "num_tokens": 684033188.0, - "reward": 1.66552734375, - "reward_std": 0.6231434345245361, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.736328125, - "rewards/format_reward/std": 0.4410543739795685, - "rewards/tag_count_reward/mean": 0.87255859375, - "rewards/tag_count_reward/std": 0.23459148406982422, + "grad_norm": 4.815431594848633, + "kl": 4.08984375, + "learning_rate": 7.720520432120768e-07, + "loss": 0.2037, + "num_tokens": 759258105.0, + "reward": 0.93115234375, + "reward_std": 0.3200600743293762, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.86865234375, + "rewards/tag_count_reward/std": 0.2467176616191864, "step": 1179 }, { @@ -34206,27 +34206,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1965.0, - "completions/mean_length": 857.51171875, - "completions/mean_terminated_length": 788.6404418945312, - "completions/min_length": 220.0, - "completions/min_terminated_length": 220.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 857.03515625, + "completions/mean_terminated_length": 847.657470703125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, "epoch": 0.40283348980114364, - "grad_norm": 1.1367852687835693, - "kl": 5.72265625, - "learning_rate": 7.712755723147777e-07, - "loss": 0.3528, - "num_tokens": 684550506.0, - "reward": 1.73779296875, - "reward_std": 0.5876256227493286, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.775390625, - "rewards/format_reward/std": 0.41773295402526855, - "rewards/tag_count_reward/mean": 0.90185546875, - "rewards/tag_count_reward/std": 0.21218155324459076, + "grad_norm": 6.9967522621154785, + "kl": 4.4296875, + "learning_rate": 7.715854155645739e-07, + "loss": 0.2963, + "num_tokens": 759775179.0, + "reward": 0.93310546875, + "reward_std": 0.30739641189575195, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86865234375, + "rewards/tag_count_reward/std": 0.24721291661262512, "step": 1180 }, { @@ -34235,27 +34235,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 803.064453125, - "completions/mean_terminated_length": 765.4909057617188, - "completions/min_length": 205.0, - "completions/min_terminated_length": 205.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 770.66015625, + "completions/mean_terminated_length": 760.6023559570312, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, "epoch": 0.40317487411453445, - "grad_norm": 2.600008964538574, - "kl": 5.7890625, - "learning_rate": 7.708085986357127e-07, - "loss": 0.4062, - "num_tokens": 685030939.0, - "reward": 1.81005859375, - "reward_std": 0.5699402689933777, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.814453125, - "rewards/format_reward/std": 0.38912075757980347, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.19085827469825745, + "grad_norm": 2.918065071105957, + "kl": 5.546875, + "learning_rate": 7.711184731784378e-07, + "loss": 0.3201, + "num_tokens": 760239021.0, + "reward": 0.955078125, + "reward_std": 0.3410775363445282, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.865234375, + "rewards/tag_count_reward/std": 0.25200241804122925, "step": 1181 }, { @@ -34264,27 +34264,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 826.353515625, - "completions/mean_terminated_length": 774.1038818359375, - "completions/min_length": 15.0, - "completions/min_terminated_length": 15.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 785.2578125, + "completions/mean_terminated_length": 772.8047485351562, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, "epoch": 0.4035162584279252, - "grad_norm": 1.3960096836090088, - "kl": 6.8046875, - "learning_rate": 7.703413115592282e-07, - "loss": 0.4225, - "num_tokens": 685530832.0, - "reward": 1.6708984375, - "reward_std": 0.6258046627044678, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.736328125, - "rewards/format_reward/std": 0.4410543739795685, - "rewards/tag_count_reward/mean": 0.8916015625, - "rewards/tag_count_reward/std": 0.2188919186592102, + "grad_norm": 2.6244304180145264, + "kl": 5.734375, + "learning_rate": 7.70651216716911e-07, + "loss": 0.3058, + "num_tokens": 760717873.0, + "reward": 0.9072265625, + "reward_std": 0.31095027923583984, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8544921875, + "rewards/tag_count_reward/std": 0.2503800094127655, "step": 1182 }, { @@ -34293,27 +34293,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 852.904296875, - "completions/mean_terminated_length": 814.352783203125, - "completions/min_length": 18.0, - "completions/min_terminated_length": 18.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 781.34375, + "completions/mean_terminated_length": 776.3765258789062, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.403857642741316, - "grad_norm": 2.728266477584839, - "kl": 7.3125, - "learning_rate": 7.69873711748553e-07, - "loss": 0.4203, - "num_tokens": 686046911.0, - "reward": 1.6982421875, - "reward_std": 0.6647390127182007, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.72265625, - "rewards/format_reward/std": 0.4481254518032074, - "rewards/tag_count_reward/mean": 0.8818359375, - "rewards/tag_count_reward/std": 0.22654588520526886, + "grad_norm": 3.8671646118164062, + "kl": 5.6640625, + "learning_rate": 7.70183646843681e-07, + "loss": 0.2649, + "num_tokens": 761197313.0, + "reward": 0.93115234375, + "reward_std": 0.31549352407455444, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86083984375, + "rewards/tag_count_reward/std": 0.24984276294708252, "step": 1183 }, { @@ -34322,27 +34322,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 816.693359375, - "completions/mean_terminated_length": 761.41015625, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 787.080078125, + "completions/mean_terminated_length": 774.6449584960938, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.40419902705470684, - "grad_norm": 3.1669857501983643, - "kl": 7.453125, - "learning_rate": 7.694057998673597e-07, - "loss": 0.4283, - "num_tokens": 686544418.0, - "reward": 1.67333984375, - "reward_std": 0.7041476964950562, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.70703125, - "rewards/format_reward/std": 0.455569326877594, - "rewards/tag_count_reward/mean": 0.87451171875, - "rewards/tag_count_reward/std": 0.22772540152072906, + "grad_norm": 9.735507011413574, + "kl": 7.703125, + "learning_rate": 7.697157642228826e-07, + "loss": 0.3966, + "num_tokens": 761679658.0, + "reward": 0.9013671875, + "reward_std": 0.3861374258995056, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8115234375, + "rewards/tag_count_reward/std": 0.2843332886695862, "step": 1184 }, { @@ -34351,27 +34351,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1923.0, - "completions/mean_length": 786.220703125, - "completions/mean_terminated_length": 753.3486938476562, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 795.5703125, + "completions/mean_terminated_length": 783.2189331054688, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, "epoch": 0.40454041136809765, - "grad_norm": 1.4147940874099731, - "kl": 4.953125, - "learning_rate": 7.68937576579763e-07, - "loss": 0.2801, - "num_tokens": 687030563.0, - "reward": 1.7041015625, - "reward_std": 0.6220543384552002, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.74609375, - "rewards/format_reward/std": 0.43567025661468506, - "rewards/tag_count_reward/mean": 0.8955078125, - "rewards/tag_count_reward/std": 0.2094147503376007, + "grad_norm": 9.291679382324219, + "kl": 6.5625, + "learning_rate": 7.692475695190924e-07, + "loss": 0.3444, + "num_tokens": 762170590.0, + "reward": 0.90185546875, + "reward_std": 0.31567996740341187, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.83154296875, + "rewards/tag_count_reward/std": 0.26829564571380615, "step": 1185 }, { @@ -34380,27 +34380,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 775.755859375, - "completions/mean_terminated_length": 734.7156982421875, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 752.94921875, + "completions/mean_terminated_length": 745.3163452148438, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.4048817956814884, - "grad_norm": 2.470407485961914, - "kl": 5.203125, - "learning_rate": 7.684690425503208e-07, - "loss": 0.3382, - "num_tokens": 687506742.0, - "reward": 1.76171875, - "reward_std": 0.5721713304519653, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.7734375, - "rewards/format_reward/std": 0.4190165400505066, - "rewards/tag_count_reward/mean": 0.908203125, - "rewards/tag_count_reward/std": 0.19997821748256683, + "grad_norm": 3.5935633182525635, + "kl": 6.65625, + "learning_rate": 7.68779063397332e-07, + "loss": 0.385, + "num_tokens": 762635092.0, + "reward": 0.91552734375, + "reward_std": 0.32506632804870605, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.84326171875, + "rewards/tag_count_reward/std": 0.26305168867111206, "step": 1186 }, { @@ -34409,27 +34409,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 759.37109375, - "completions/mean_terminated_length": 725.7996215820312, - "completions/min_length": 86.0, - "completions/min_terminated_length": 86.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 739.61328125, + "completions/mean_terminated_length": 734.482421875, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.4052231799948792, - "grad_norm": 1.7808350324630737, - "kl": 4.9609375, - "learning_rate": 7.680001984440312e-07, - "loss": 0.3089, - "num_tokens": 687975044.0, - "reward": 1.77392578125, - "reward_std": 0.6071911454200745, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.7734375, - "rewards/format_reward/std": 0.4190165400505066, - "rewards/tag_count_reward/mean": 0.90478515625, - "rewards/tag_count_reward/std": 0.20294204354286194, + "grad_norm": 2.4937756061553955, + "kl": 5.84375, + "learning_rate": 7.683102465230648e-07, + "loss": 0.3251, + "num_tokens": 763093278.0, + "reward": 0.93115234375, + "reward_std": 0.3432146906852722, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.84130859375, + "rewards/tag_count_reward/std": 0.26924487948417664, "step": 1187 }, { @@ -34438,27 +34438,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1973.0, - "completions/mean_length": 792.6484375, - "completions/mean_terminated_length": 733.603271484375, - "completions/min_length": 134.0, - "completions/min_terminated_length": 134.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 742.21484375, + "completions/mean_terminated_length": 739.6594848632812, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, "epoch": 0.40556456430827004, - "grad_norm": 1.2724409103393555, - "kl": 6.765625, - "learning_rate": 7.675310449263326e-07, - "loss": 0.42, - "num_tokens": 688453536.0, - "reward": 1.671875, - "reward_std": 0.6647064685821533, - "rewards/accuracy_reward/mean": 0.05416666716337204, - "rewards/accuracy_reward/std": 0.22658243775367737, - "rewards/format_reward/mean": 0.734375, - "rewards/format_reward/std": 0.44209739565849304, - "rewards/tag_count_reward/mean": 0.88671875, - "rewards/tag_count_reward/std": 0.21696485579013824, + "grad_norm": 2.254314661026001, + "kl": 6.1484375, + "learning_rate": 7.678411195621953e-07, + "loss": 0.3242, + "num_tokens": 763545948.0, + "reward": 0.8701171875, + "reward_std": 0.35820671916007996, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24231401085853577, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8115234375, + "rewards/tag_count_reward/std": 0.2847631275653839, "step": 1188 }, { @@ -34467,27 +34467,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 827.001953125, - "completions/mean_terminated_length": 774.7800903320312, - "completions/min_length": 201.0, - "completions/min_terminated_length": 201.0, + "completions/max_terminated_length": 1900.0, + "completions/mean_length": 764.650390625, + "completions/mean_terminated_length": 762.138916015625, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, "epoch": 0.40590594862166085, - "grad_norm": 1.154120683670044, - "kl": 6.21484375, - "learning_rate": 7.670615826631027e-07, - "loss": 0.3925, - "num_tokens": 688956433.0, - "reward": 1.74755859375, - "reward_std": 0.6169498562812805, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.763671875, - "rewards/format_reward/std": 0.42524150013923645, - "rewards/tag_count_reward/mean": 0.89013671875, - "rewards/tag_count_reward/std": 0.22989697754383087, + "grad_norm": 5.211688041687012, + "kl": 5.3671875, + "learning_rate": 7.673716831810688e-07, + "loss": 0.2997, + "num_tokens": 764016921.0, + "reward": 0.93505859375, + "reward_std": 0.35613590478897095, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.82177734375, + "rewards/tag_count_reward/std": 0.27732184529304504, "step": 1189 }, { @@ -34496,27 +34496,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 839.15625, - "completions/mean_terminated_length": 810.14404296875, - "completions/min_length": 77.0, - "completions/min_terminated_length": 77.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 799.658203125, + "completions/mean_terminated_length": 797.2152709960938, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.4062473329350516, - "grad_norm": 1.295357584953308, - "kl": 5.26953125, - "learning_rate": 7.665918123206572e-07, - "loss": 0.3226, - "num_tokens": 689454913.0, - "reward": 1.77294921875, - "reward_std": 0.5832884311676025, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.78515625, - "rewards/format_reward/std": 0.4111155867576599, - "rewards/tag_count_reward/mean": 0.91162109375, - "rewards/tag_count_reward/std": 0.19999195635318756, + "grad_norm": 3.489217519760132, + "kl": 5.765625, + "learning_rate": 7.669019380464703e-07, + "loss": 0.2956, + "num_tokens": 764495178.0, + "reward": 0.92431640625, + "reward_std": 0.35767507553100586, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.83447265625, + "rewards/tag_count_reward/std": 0.27192169427871704, "step": 1190 }, { @@ -34525,27 +34525,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1758.0, - "completions/mean_length": 767.30859375, - "completions/mean_terminated_length": 728.6558837890625, - "completions/min_length": 39.0, - "completions/min_terminated_length": 39.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1839.0, + "completions/max_terminated_length": 1839.0, + "completions/mean_length": 735.388671875, + "completions/mean_terminated_length": 735.388671875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, "epoch": 0.4065887172484424, - "grad_norm": 2.6538870334625244, - "kl": 5.484375, - "learning_rate": 7.661217345657495e-07, - "loss": 0.3574, - "num_tokens": 689918015.0, - "reward": 1.81005859375, - "reward_std": 0.6497660279273987, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.80078125, - "rewards/format_reward/std": 0.39980348944664, - "rewards/tag_count_reward/mean": 0.90771484375, - "rewards/tag_count_reward/std": 0.21019525825977325, + "grad_norm": 1.9683349132537842, + "kl": 4.83203125, + "learning_rate": 7.664318848256226e-07, + "loss": 0.234, + "num_tokens": 764941937.0, + "reward": 0.94287109375, + "reward_std": 0.3618546426296234, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.84326171875, + "rewards/tag_count_reward/std": 0.27174240350723267, "step": 1191 }, { @@ -34554,27 +34554,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 833.564453125, - "completions/mean_terminated_length": 768.5946044921875, - "completions/min_length": 210.0, - "completions/min_terminated_length": 210.0, + "completions/max_terminated_length": 1694.0, + "completions/mean_length": 766.26953125, + "completions/mean_terminated_length": 763.76123046875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.40693010156183324, - "grad_norm": 1.9779895544052124, - "kl": 6.890625, - "learning_rate": 7.656513500655688e-07, - "loss": 0.4104, - "num_tokens": 690418768.0, - "reward": 1.75927734375, - "reward_std": 0.5864324569702148, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.787109375, - "rewards/format_reward/std": 0.409751296043396, - "rewards/tag_count_reward/mean": 0.89404296875, - "rewards/tag_count_reward/std": 0.23066876828670502, + "grad_norm": 1.6535422801971436, + "kl": 4.8125, + "learning_rate": 7.659615241861867e-07, + "loss": 0.2081, + "num_tokens": 765408235.0, + "reward": 0.92578125, + "reward_std": 0.3301663398742676, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.268602192401886, "step": 1192 }, { @@ -34583,27 +34583,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 828.177734375, - "completions/mean_terminated_length": 781.1663208007812, - "completions/min_length": 99.0, - "completions/min_terminated_length": 99.0, + "completions/max_terminated_length": 1919.0, + "completions/mean_length": 746.6015625, + "completions/mean_terminated_length": 744.0548095703125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.40727148587522405, - "grad_norm": 0.8671897053718567, - "kl": 6.2265625, - "learning_rate": 7.651806594877397e-07, - "loss": 0.3636, - "num_tokens": 690916587.0, - "reward": 1.74658203125, - "reward_std": 0.6254327297210693, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.77734375, - "rewards/format_reward/std": 0.41643625497817993, - "rewards/tag_count_reward/mean": 0.89697265625, - "rewards/tag_count_reward/std": 0.22394701838493347, + "grad_norm": 2.5272395610809326, + "kl": 5.484375, + "learning_rate": 7.654908567962601e-07, + "loss": 0.2769, + "num_tokens": 765864287.0, + "reward": 0.91845703125, + "reward_std": 0.3244733214378357, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.83642578125, + "rewards/tag_count_reward/std": 0.2704029083251953, "step": 1193 }, { @@ -34612,27 +34612,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1842.0, - "completions/mean_length": 785.36328125, - "completions/mean_terminated_length": 736.7017822265625, - "completions/min_length": 104.0, - "completions/min_terminated_length": 104.0, + "completions/max_terminated_length": 1556.0, + "completions/mean_length": 705.248046875, + "completions/mean_terminated_length": 699.982421875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, "epoch": 0.4076128701886148, - "grad_norm": 1.2734403610229492, - "kl": 6.390625, - "learning_rate": 7.647096635003216e-07, - "loss": 0.3881, - "num_tokens": 691391349.0, - "reward": 1.810546875, - "reward_std": 0.6113005876541138, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.8125, - "rewards/format_reward/std": 0.39069411158561707, - "rewards/tag_count_reward/mean": 0.908203125, - "rewards/tag_count_reward/std": 0.20718760788440704, + "grad_norm": 2.885803461074829, + "kl": 5.3046875, + "learning_rate": 7.650198833243762e-07, + "loss": 0.228, + "num_tokens": 766298030.0, + "reward": 0.9228515625, + "reward_std": 0.3407388925552368, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8466796875, + "rewards/tag_count_reward/std": 0.26111066341400146, "step": 1194 }, { @@ -34641,27 +34641,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 870.53515625, - "completions/mean_terminated_length": 822.6707153320312, - "completions/min_length": 272.0, - "completions/min_terminated_length": 272.0, + "completions/max_terminated_length": 1883.0, + "completions/mean_length": 764.3984375, + "completions/mean_terminated_length": 761.886474609375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.4079542545020056, - "grad_norm": 2.4065842628479004, - "kl": 6.4453125, - "learning_rate": 7.642383627718072e-07, - "loss": 0.4508, - "num_tokens": 691920327.0, - "reward": 1.802734375, - "reward_std": 0.5818156003952026, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.91015625, - "rewards/tag_count_reward/std": 0.21326914429664612, + "grad_norm": 3.4445908069610596, + "kl": 5.921875, + "learning_rate": 7.645486044395029e-07, + "loss": 0.266, + "num_tokens": 766772666.0, + "reward": 0.8935546875, + "reward_std": 0.3538340926170349, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8056640625, + "rewards/tag_count_reward/std": 0.28078925609588623, "step": 1195 }, { @@ -34670,27 +34670,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 776.951171875, - "completions/mean_terminated_length": 717.1676635742188, - "completions/min_length": 262.0, - "completions/min_terminated_length": 262.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1872.0, + "completions/max_terminated_length": 1872.0, + "completions/mean_length": 701.73046875, + "completions/mean_terminated_length": 701.73046875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.40829563881539643, - "grad_norm": 1.3699885606765747, - "kl": 6.4375, - "learning_rate": 7.637667579711215e-07, - "loss": 0.4272, - "num_tokens": 692395326.0, - "reward": 1.8544921875, - "reward_std": 0.5869347453117371, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.20006181299686432, + "grad_norm": 1.889304280281067, + "kl": 4.921875, + "learning_rate": 7.640770208110419e-07, + "loss": 0.2358, + "num_tokens": 767209152.0, + "reward": 0.95947265625, + "reward_std": 0.34182602167129517, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86376953125, + "rewards/tag_count_reward/std": 0.243045836687088, "step": 1196 }, { @@ -34699,27 +34699,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 853.14453125, - "completions/mean_terminated_length": 794.381103515625, - "completions/min_length": 298.0, - "completions/min_terminated_length": 298.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1954.0, + "completions/max_terminated_length": 1954.0, + "completions/mean_length": 750.609375, + "completions/mean_terminated_length": 750.609375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, "epoch": 0.40863702312878725, - "grad_norm": 1.542723536491394, - "kl": 6.453125, - "learning_rate": 7.632948497676213e-07, - "loss": 0.3742, - "num_tokens": 692917336.0, - "reward": 1.77783203125, - "reward_std": 0.6210418343544006, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.791015625, - "rewards/format_reward/std": 0.40698084235191345, - "rewards/tag_count_reward/mean": 0.90087890625, - "rewards/tag_count_reward/std": 0.2168629914522171, + "grad_norm": 2.724968910217285, + "kl": 4.89453125, + "learning_rate": 7.636051331088277e-07, + "loss": 0.2089, + "num_tokens": 767678664.0, + "reward": 0.94580078125, + "reward_std": 0.3378886580467224, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.85205078125, + "rewards/tag_count_reward/std": 0.2569212019443512, "step": 1197 }, { @@ -34728,27 +34728,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.068359375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 871.080078125, - "completions/mean_terminated_length": 784.7232666015625, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/max_terminated_length": 1903.0, + "completions/mean_length": 745.890625, + "completions/mean_terminated_length": 743.3424682617188, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.408978407442178, - "grad_norm": 1.8710319995880127, - "kl": 7.984375, - "learning_rate": 7.62822638831094e-07, - "loss": 0.4869, - "num_tokens": 693438993.0, - "reward": 1.7080078125, - "reward_std": 0.5856927037239075, - "rewards/accuracy_reward/mean": 0.024193547666072845, - "rewards/accuracy_reward/std": 0.15380479395389557, - "rewards/format_reward/mean": 0.7890625, - "rewards/format_reward/std": 0.4083731174468994, - "rewards/tag_count_reward/mean": 0.8955078125, - "rewards/tag_count_reward/std": 0.2321290373802185, + "grad_norm": 2.645230293273926, + "kl": 4.12890625, + "learning_rate": 7.63132942003127e-07, + "loss": 0.1829, + "num_tokens": 768136224.0, + "reward": 0.896484375, + "reward_std": 0.3052387833595276, + "rewards/accuracy_reward/mean": 0.04435483738780022, + "rewards/accuracy_reward/std": 0.2060900777578354, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.853515625, + "rewards/tag_count_reward/std": 0.2556166350841522, "step": 1198 }, { @@ -34757,27 +34757,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1769.0, - "completions/mean_length": 763.900390625, - "completions/mean_terminated_length": 748.6739501953125, - "completions/min_length": 67.0, - "completions/min_terminated_length": 67.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1777.0, + "completions/max_terminated_length": 1777.0, + "completions/mean_length": 730.513671875, + "completions/mean_terminated_length": 730.513671875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, "epoch": 0.4093197917555688, - "grad_norm": 1.8708481788635254, - "kl": 3.703125, - "learning_rate": 7.623501258317567e-07, - "loss": 0.2097, - "num_tokens": 693899566.0, - "reward": 1.88134765625, - "reward_std": 0.5026733875274658, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93994140625, - "rewards/tag_count_reward/std": 0.1732655018568039, + "grad_norm": 1.8920918703079224, + "kl": 4.171875, + "learning_rate": 7.626604481646375e-07, + "loss": 0.1768, + "num_tokens": 768579703.0, + "reward": 0.94970703125, + "reward_std": 0.29629284143447876, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86572265625, + "rewards/tag_count_reward/std": 0.2380446046590805, "step": 1199 }, { @@ -34786,27 +34786,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1914.0, - "completions/mean_length": 834.232421875, - "completions/mean_terminated_length": 782.31982421875, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 764.498046875, + "completions/mean_terminated_length": 759.4647216796875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.40966117606895963, - "grad_norm": 1.1944559812545776, - "kl": 7.3984375, - "learning_rate": 7.618773114402554e-07, - "loss": 0.4642, - "num_tokens": 694406853.0, - "reward": 1.7294921875, - "reward_std": 0.5911108255386353, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.791015625, - "rewards/format_reward/std": 0.40698084235191345, - "rewards/tag_count_reward/mean": 0.9033203125, - "rewards/tag_count_reward/std": 0.21824489533901215, + "grad_norm": 1.9259616136550903, + "kl": 3.3203125, + "learning_rate": 7.621876522644863e-07, + "loss": 0.1293, + "num_tokens": 769051286.0, + "reward": 0.943359375, + "reward_std": 0.29152172803878784, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.888671875, + "rewards/tag_count_reward/std": 0.22623565793037415, "step": 1200 }, { @@ -34815,27 +34815,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 777.416015625, - "completions/mean_terminated_length": 725.7662353515625, - "completions/min_length": 175.0, - "completions/min_terminated_length": 175.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1901.0, + "completions/max_terminated_length": 1901.0, + "completions/mean_length": 691.71484375, + "completions/mean_terminated_length": 691.71484375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.41000256038235044, - "grad_norm": 1.3154793977737427, - "kl": 7.0703125, - "learning_rate": 7.61404196327663e-07, - "loss": 0.4424, - "num_tokens": 694879066.0, - "reward": 1.80615234375, - "reward_std": 0.5888994932174683, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.90576171875, - "rewards/tag_count_reward/std": 0.21280211210250854, + "grad_norm": 3.8292248249053955, + "kl": 2.91015625, + "learning_rate": 7.617145549742302e-07, + "loss": 0.1257, + "num_tokens": 769479620.0, + "reward": 1.02978515625, + "reward_std": 0.29510825872421265, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89892578125, + "rewards/tag_count_reward/std": 0.2055107206106186, "step": 1201 }, { @@ -34844,27 +34844,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 830.302734375, - "completions/mean_terminated_length": 778.2220458984375, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/max_terminated_length": 1838.0, + "completions/mean_length": 725.02734375, + "completions/mean_terminated_length": 722.4383544921875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.41034394469574126, - "grad_norm": 1.958497405052185, - "kl": 6.9296875, - "learning_rate": 7.609307811654804e-07, - "loss": 0.4531, - "num_tokens": 695380469.0, - "reward": 1.75, - "reward_std": 0.55804443359375, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.91015625, - "rewards/tag_count_reward/std": 0.2062724232673645, + "grad_norm": 1.8830933570861816, + "kl": 2.30078125, + "learning_rate": 7.612411569658539e-07, + "loss": 0.083, + "num_tokens": 769927122.0, + "reward": 0.97802734375, + "reward_std": 0.2674487233161926, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91357421875, + "rewards/tag_count_reward/std": 0.1895672231912613, "step": 1202 }, { @@ -34873,27 +34873,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 885.5234375, - "completions/mean_terminated_length": 825.8480834960938, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 1886.0, + "completions/mean_length": 807.830078125, + "completions/mean_terminated_length": 802.9667358398438, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.410685329009132, - "grad_norm": 1.5910775661468506, - "kl": 7.1796875, - "learning_rate": 7.604570666256339e-07, - "loss": 0.4556, - "num_tokens": 695915137.0, - "reward": 1.73779296875, - "reward_std": 0.5948113203048706, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.90380859375, - "rewards/tag_count_reward/std": 0.2226203829050064, + "grad_norm": 2.0479135513305664, + "kl": 1.978515625, + "learning_rate": 7.607674589117691e-07, + "loss": 0.0837, + "num_tokens": 770422011.0, + "reward": 0.97412109375, + "reward_std": 0.24014775454998016, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91552734375, + "rewards/tag_count_reward/std": 0.19363176822662354, "step": 1203 }, { @@ -34902,27 +34902,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 792.06640625, - "completions/mean_terminated_length": 748.933349609375, - "completions/min_length": 187.0, - "completions/min_terminated_length": 187.0, + "completions/max_terminated_length": 1753.0, + "completions/mean_length": 728.642578125, + "completions/mean_terminated_length": 726.0606689453125, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.41102671332252283, - "grad_norm": 1.8395967483520508, - "kl": 6.5546875, - "learning_rate": 7.599830533804741e-07, - "loss": 0.383, - "num_tokens": 696399315.0, - "reward": 1.81103515625, - "reward_std": 0.5429419875144958, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.90869140625, - "rewards/tag_count_reward/std": 0.21635127067565918, + "grad_norm": 4.370997905731201, + "kl": 1.849609375, + "learning_rate": 7.602934614848135e-07, + "loss": 0.0781, + "num_tokens": 770873716.0, + "reward": 1.0556640625, + "reward_std": 0.2359110713005066, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.9326171875, + "rewards/tag_count_reward/std": 0.1723288893699646, "step": 1204 }, { @@ -34931,27 +34931,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1975.0, - "completions/mean_length": 717.984375, - "completions/mean_terminated_length": 691.4900512695312, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/max_terminated_length": 1785.0, + "completions/mean_length": 661.126953125, + "completions/mean_terminated_length": 658.4129028320312, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, "epoch": 0.41136809763591364, - "grad_norm": 2.9843039512634277, - "kl": 4.9140625, - "learning_rate": 7.595087421027767e-07, - "loss": 0.3451, - "num_tokens": 696845227.0, - "reward": 1.87109375, - "reward_std": 0.5197564363479614, - "rewards/accuracy_reward/mean": 0.07459677755832672, - "rewards/accuracy_reward/std": 0.263004869222641, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.9375, - "rewards/tag_count_reward/std": 0.1863754689693451, + "grad_norm": 1.6642152070999146, + "kl": 2.88671875, + "learning_rate": 7.598191653582505e-07, + "loss": 0.158, + "num_tokens": 771290517.0, + "reward": 1.0361328125, + "reward_std": 0.23599085211753845, + "rewards/accuracy_reward/mean": 0.11491935700178146, + "rewards/accuracy_reward/std": 0.3192466199398041, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.18362505733966827, "step": 1205 }, { @@ -34960,27 +34960,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 796.98046875, - "completions/mean_terminated_length": 759.2233276367188, - "completions/min_length": 237.0, - "completions/min_terminated_length": 237.0, + "completions/max_terminated_length": 1818.0, + "completions/mean_length": 752.44921875, + "completions/mean_terminated_length": 747.36865234375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, "epoch": 0.41170948194930446, - "grad_norm": 4.940004348754883, - "kl": 5.8359375, - "learning_rate": 7.590341334657395e-07, - "loss": 0.4393, - "num_tokens": 697329937.0, - "reward": 1.87255859375, - "reward_std": 0.5778836011886597, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.180350661277771, + "grad_norm": 4.010087013244629, + "kl": 2.75, + "learning_rate": 7.593445712057676e-07, + "loss": 0.1477, + "num_tokens": 771752427.0, + "reward": 1.0302734375, + "reward_std": 0.2761410176753998, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9169921875, + "rewards/tag_count_reward/std": 0.2001573145389557, "step": 1206 }, { @@ -34989,27 +34989,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1849.0, - "completions/mean_length": 856.20703125, - "completions/mean_terminated_length": 802.6979370117188, - "completions/min_length": 183.0, - "completions/min_terminated_length": 183.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1864.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 769.9921875, + "completions/mean_terminated_length": 769.9921875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, "epoch": 0.4120508662626952, - "grad_norm": 0.8065212965011597, - "kl": 6.578125, - "learning_rate": 7.585592281429828e-07, - "loss": 0.3917, - "num_tokens": 697847083.0, - "reward": 1.7763671875, - "reward_std": 0.5780097842216492, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.9091796875, - "rewards/tag_count_reward/std": 0.21227891743183136, + "grad_norm": 2.0366873741149902, + "kl": 2.6171875, + "learning_rate": 7.588696797014755e-07, + "loss": 0.1405, + "num_tokens": 772225431.0, + "reward": 0.98974609375, + "reward_std": 0.26377055048942566, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91357421875, + "rewards/tag_count_reward/std": 0.18892091512680054, "step": 1207 }, { @@ -35018,27 +35018,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1947.0, - "completions/mean_length": 801.88671875, - "completions/mean_terminated_length": 751.231689453125, - "completions/min_length": 29.0, - "completions/min_terminated_length": 29.0, + "completions/max_terminated_length": 1827.0, + "completions/mean_length": 710.583984375, + "completions/mean_terminated_length": 702.701416015625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, "epoch": 0.41239225057608603, - "grad_norm": 1.0168187618255615, - "kl": 5.6796875, - "learning_rate": 7.580840268085477e-07, - "loss": 0.3418, - "num_tokens": 698337825.0, - "reward": 1.84130859375, - "reward_std": 0.5921512842178345, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.91162109375, - "rewards/tag_count_reward/std": 0.2089642435312271, + "grad_norm": 3.1804232597351074, + "kl": 4.5, + "learning_rate": 7.583944915199073e-07, + "loss": 0.2368, + "num_tokens": 772669426.0, + "reward": 1.0380859375, + "reward_std": 0.2954888641834259, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.1984890252351761, "step": 1208 }, { @@ -35047,27 +35047,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 880.62109375, - "completions/mean_terminated_length": 825.7136840820312, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 770.615234375, + "completions/mean_terminated_length": 768.115478515625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, "epoch": 0.41273363488947684, - "grad_norm": 1.1118146181106567, - "kl": 8.015625, - "learning_rate": 7.576085301368955e-07, - "loss": 0.4832, - "num_tokens": 698865359.0, - "reward": 1.73193359375, - "reward_std": 0.6110692024230957, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.79296875, - "rewards/format_reward/std": 0.40557438135147095, - "rewards/tag_count_reward/mean": 0.89404296875, - "rewards/tag_count_reward/std": 0.23278005421161652, + "grad_norm": 2.5155560970306396, + "kl": 3.7421875, + "learning_rate": 7.579190073360175e-07, + "loss": 0.1608, + "num_tokens": 773140637.0, + "reward": 0.95703125, + "reward_std": 0.27758684754371643, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.900390625, + "rewards/tag_count_reward/std": 0.21004575490951538, "step": 1209 }, { @@ -35076,27 +35076,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 883.099609375, - "completions/mean_terminated_length": 818.24951171875, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1945.0, + "completions/max_terminated_length": 1945.0, + "completions/mean_length": 729.69140625, + "completions/mean_terminated_length": 729.69140625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, "epoch": 0.41307501920286765, - "grad_norm": 3.1092116832733154, - "kl": 8.0234375, - "learning_rate": 7.571327388029071e-07, - "loss": 0.4615, - "num_tokens": 699387554.0, - "reward": 1.7412109375, - "reward_std": 0.6016433238983154, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.794921875, - "rewards/format_reward/std": 0.4041535556316376, - "rewards/tag_count_reward/mean": 0.8955078125, - "rewards/tag_count_reward/std": 0.22733746469020844, + "grad_norm": 4.559638977050781, + "kl": 3.3125, + "learning_rate": 7.574432278251813e-07, + "loss": 0.1399, + "num_tokens": 773584287.0, + "reward": 0.9873046875, + "reward_std": 0.24846667051315308, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.18347929418087006, "step": 1210 }, { @@ -35105,27 +35105,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 806.70703125, - "completions/mean_terminated_length": 753.6171264648438, + "completions/max_terminated_length": 1890.0, + "completions/mean_length": 707.55859375, + "completions/mean_terminated_length": 704.9354248046875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.4134164035162584, - "grad_norm": 1.6269659996032715, - "kl": 6.5859375, - "learning_rate": 7.566566534818809e-07, - "loss": 0.4204, - "num_tokens": 699887196.0, - "reward": 1.73388671875, - "reward_std": 0.6248112916946411, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.775390625, - "rewards/format_reward/std": 0.41773295402526855, - "rewards/tag_count_reward/mean": 0.89990234375, - "rewards/tag_count_reward/std": 0.21471090614795685, + "grad_norm": 5.833067417144775, + "kl": 4.28125, + "learning_rate": 7.569671536631928e-07, + "loss": 0.2186, + "num_tokens": 774033165.0, + "reward": 0.970703125, + "reward_std": 0.26562875509262085, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.21469810605049133, "step": 1211 }, { @@ -35134,56 +35134,56 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 829.37890625, - "completions/mean_terminated_length": 779.8414306640625, - "completions/min_length": 179.0, - "completions/min_terminated_length": 179.0, + "completions/max_terminated_length": 1912.0, + "completions/mean_length": 716.3515625, + "completions/mean_terminated_length": 713.74560546875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.4137577878296492, - "grad_norm": 1.2652716636657715, - "kl": 6.37109375, - "learning_rate": 7.561802748495332e-07, - "loss": 0.4199, - "num_tokens": 700390206.0, - "reward": 1.81884765625, - "reward_std": 0.6061054468154907, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.20499862730503082, + "grad_norm": 3.1062331199645996, + "kl": 3.93359375, + "learning_rate": 7.564907855262652e-07, + "loss": 0.2344, + "num_tokens": 774478305.0, + "reward": 0.9931640625, + "reward_std": 0.2758350968360901, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9072265625, + "rewards/tag_count_reward/std": 0.20556476712226868, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 827.82421875, - "completions/mean_terminated_length": 778.2235717773438, - "completions/min_length": 217.0, - "completions/min_terminated_length": 217.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1821.0, + "completions/max_terminated_length": 1821.0, + "completions/mean_length": 676.357421875, + "completions/mean_terminated_length": 676.357421875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, "epoch": 0.41409917214304004, - "grad_norm": 1.3812679052352905, - "kl": 7.4765625, - "learning_rate": 7.557036035819963e-07, - "loss": 0.4555, - "num_tokens": 700891732.0, - "reward": 1.73193359375, - "reward_std": 0.6351964473724365, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.7734375, - "rewards/format_reward/std": 0.4190165400505066, - "rewards/tag_count_reward/mean": 0.89013671875, - "rewards/tag_count_reward/std": 0.23359142243862152, + "grad_norm": 2.1832549571990967, + "kl": 3.388671875, + "learning_rate": 7.560141240910292e-07, + "loss": 0.1532, + "num_tokens": 774902280.0, + "reward": 0.9775390625, + "reward_std": 0.2534903883934021, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9033203125, + "rewards/tag_count_reward/std": 0.20254908502101898, "step": 1213 }, { @@ -35192,27 +35192,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 819.130859375, - "completions/mean_terminated_length": 797.1431274414062, - "completions/min_length": 180.0, - "completions/min_terminated_length": 180.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1715.0, + "completions/max_terminated_length": 1715.0, + "completions/mean_length": 699.9609375, + "completions/mean_terminated_length": 699.9609375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.41444055645643085, - "grad_norm": 1.9653494358062744, - "kl": 4.7265625, - "learning_rate": 7.552266403558176e-07, - "loss": 0.2734, - "num_tokens": 701386487.0, - "reward": 1.75830078125, - "reward_std": 0.5999512672424316, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.21380557119846344, + "grad_norm": 1.6816651821136475, + "kl": 3.021484375, + "learning_rate": 7.555371700345314e-07, + "loss": 0.1595, + "num_tokens": 775336020.0, + "reward": 0.984375, + "reward_std": 0.23822785913944244, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.18783605098724365, "step": 1214 }, { @@ -35221,27 +35221,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 822.66796875, - "completions/mean_terminated_length": 754.45361328125, - "completions/min_length": 212.0, - "completions/min_terminated_length": 212.0, + "completions/max_terminated_length": 1478.0, + "completions/mean_length": 635.57421875, + "completions/mean_terminated_length": 632.8101806640625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, "epoch": 0.4147819407698216, - "grad_norm": 1.7280266284942627, - "kl": 8.40625, - "learning_rate": 7.547493858479595e-07, - "loss": 0.5356, - "num_tokens": 701879565.0, - "reward": 1.724609375, - "reward_std": 0.7054948806762695, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.740234375, - "rewards/format_reward/std": 0.4389347732067108, - "rewards/tag_count_reward/mean": 0.873046875, - "rewards/tag_count_reward/std": 0.2392423003911972, + "grad_norm": 2.663269519805908, + "kl": 3.0, + "learning_rate": 7.550599240342348e-07, + "loss": 0.1569, + "num_tokens": 775733306.0, + "reward": 1.06298828125, + "reward_std": 0.3054298758506775, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91064453125, + "rewards/tag_count_reward/std": 0.19832709431648254, "step": 1215 }, { @@ -35250,27 +35250,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1980.0, - "completions/mean_length": 797.548828125, - "completions/mean_terminated_length": 736.0512084960938, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1586.0, + "completions/max_terminated_length": 1586.0, + "completions/mean_length": 664.2578125, + "completions/mean_terminated_length": 664.2578125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.4151233250832124, - "grad_norm": 2.610924482345581, - "kl": 6.3515625, - "learning_rate": 7.542718407357973e-07, - "loss": 0.4493, - "num_tokens": 702366438.0, - "reward": 1.7490234375, - "reward_std": 0.6174426078796387, - "rewards/accuracy_reward/mean": 0.06451612710952759, - "rewards/accuracy_reward/std": 0.2459181249141693, - "rewards/format_reward/mean": 0.7890625, - "rewards/format_reward/std": 0.4083731174468994, - "rewards/tag_count_reward/mean": 0.8974609375, - "rewards/tag_count_reward/std": 0.2228032499551773, + "grad_norm": 3.425699472427368, + "kl": 2.78125, + "learning_rate": 7.545823867680172e-07, + "loss": 0.1455, + "num_tokens": 776151934.0, + "reward": 1.0126953125, + "reward_std": 0.2261040210723877, + "rewards/accuracy_reward/mean": 0.08669354766607285, + "rewards/accuracy_reward/std": 0.281669557094574, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.183853879570961, "step": 1216 }, { @@ -35279,27 +35279,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 823.921875, - "completions/mean_terminated_length": 784.4354858398438, - "completions/min_length": 176.0, - "completions/min_terminated_length": 176.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1526.0, + "completions/max_terminated_length": 1526.0, + "completions/mean_length": 670.31640625, + "completions/mean_terminated_length": 670.31640625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, "epoch": 0.41546470939660324, - "grad_norm": 1.7523225545883179, - "kl": 6.328125, - "learning_rate": 7.537940056971192e-07, - "loss": 0.3804, - "num_tokens": 702862318.0, - "reward": 1.69921875, - "reward_std": 0.6397418975830078, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.767578125, - "rewards/format_reward/std": 0.42278963327407837, - "rewards/tag_count_reward/mean": 0.884765625, - "rewards/tag_count_reward/std": 0.22697781026363373, + "grad_norm": 2.243535280227661, + "kl": 3.03515625, + "learning_rate": 7.54104558914169e-07, + "loss": 0.1405, + "num_tokens": 776569168.0, + "reward": 0.9697265625, + "reward_std": 0.24969938397407532, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.19095149636268616, "step": 1217 }, { @@ -35308,27 +35308,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 755.7734375, - "completions/mean_terminated_length": 700.505126953125, - "completions/min_length": 76.0, - "completions/min_terminated_length": 76.0, + "completions/max_terminated_length": 1498.0, + "completions/mean_length": 601.0546875, + "completions/mean_terminated_length": 598.2230834960938, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, "epoch": 0.41580609370999405, - "grad_norm": 1.3652105331420898, - "kl": 6.296875, - "learning_rate": 7.533158814101242e-07, - "loss": 0.4089, - "num_tokens": 703334346.0, - "reward": 1.76318359375, - "reward_std": 0.6007625460624695, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.765625, - "rewards/format_reward/std": 0.42402184009552, - "rewards/tag_count_reward/mean": 0.89990234375, - "rewards/tag_count_reward/std": 0.21299511194229126, + "grad_norm": 4.377151012420654, + "kl": 4.1015625, + "learning_rate": 7.536264411513948e-07, + "loss": 0.2096, + "num_tokens": 776961980.0, + "reward": 1.0029296875, + "reward_std": 0.28715670108795166, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.19600872695446014, "step": 1218 }, { @@ -35337,27 +35337,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 876.01171875, - "completions/mean_terminated_length": 797.8792114257812, - "completions/min_length": 17.0, - "completions/min_terminated_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1682.0, + "completions/max_terminated_length": 1682.0, + "completions/mean_length": 681.7734375, + "completions/mean_terminated_length": 681.7734375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, "epoch": 0.4161474780233848, - "grad_norm": 1.6425671577453613, - "kl": 8.3046875, - "learning_rate": 7.528374685534227e-07, - "loss": 0.5534, - "num_tokens": 703868752.0, - "reward": 1.6044921875, - "reward_std": 0.7019073367118835, - "rewards/accuracy_reward/mean": 0.05645161122083664, - "rewards/accuracy_reward/std": 0.23102475702762604, - "rewards/format_reward/mean": 0.69921875, - "rewards/format_reward/std": 0.45904624462127686, - "rewards/tag_count_reward/mean": 0.8505859375, - "rewards/tag_count_reward/std": 0.2596279978752136, + "grad_norm": 1.8211148977279663, + "kl": 2.826171875, + "learning_rate": 7.531480341588101e-07, + "loss": 0.1257, + "num_tokens": 777396936.0, + "reward": 0.9638671875, + "reward_std": 0.2702760696411133, + "rewards/accuracy_reward/mean": 0.058467742055654526, + "rewards/accuracy_reward/std": 0.23486268520355225, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9072265625, + "rewards/tag_count_reward/std": 0.20734204351902008, "step": 1219 }, { @@ -35366,27 +35366,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 864.052734375, - "completions/mean_terminated_length": 795.5598754882812, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/max_terminated_length": 1507.0, + "completions/mean_length": 701.68359375, + "completions/mean_terminated_length": 699.0488891601562, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, "epoch": 0.4164888623367756, - "grad_norm": 1.81349778175354, - "kl": 6.1953125, - "learning_rate": 7.52358767806034e-07, - "loss": 0.4059, - "num_tokens": 704389803.0, - "reward": 1.75927734375, - "reward_std": 0.6504988670349121, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.77734375, - "rewards/format_reward/std": 0.41643625497817993, - "rewards/tag_count_reward/mean": 0.88623046875, - "rewards/tag_count_reward/std": 0.23118187487125397, + "grad_norm": 5.5619683265686035, + "kl": 3.55078125, + "learning_rate": 7.526693386159411e-07, + "loss": 0.1408, + "num_tokens": 777834854.0, + "reward": 1.0185546875, + "reward_std": 0.3216370940208435, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9013671875, + "rewards/tag_count_reward/std": 0.21339233219623566, "step": 1220 }, { @@ -35395,27 +35395,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1935.0, - "completions/mean_length": 879.876953125, - "completions/mean_terminated_length": 807.1722412109375, - "completions/min_length": 204.0, - "completions/min_terminated_length": 204.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1844.0, + "completions/max_terminated_length": 1844.0, + "completions/mean_length": 702.060546875, + "completions/mean_terminated_length": 702.060546875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, "epoch": 0.41683024665016644, - "grad_norm": 1.5401265621185303, - "kl": 6.890625, - "learning_rate": 7.518797798473865e-07, - "loss": 0.4578, - "num_tokens": 704917340.0, - "reward": 1.7490234375, - "reward_std": 0.6217037439346313, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.7734375, - "rewards/format_reward/std": 0.4190165400505066, - "rewards/tag_count_reward/mean": 0.8876953125, - "rewards/tag_count_reward/std": 0.23110590875148773, + "grad_norm": 2.8141682147979736, + "kl": 3.26171875, + "learning_rate": 7.521903552027246e-07, + "loss": 0.1162, + "num_tokens": 778271349.0, + "reward": 0.96044921875, + "reward_std": 0.27343103289604187, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89013671875, + "rewards/tag_count_reward/std": 0.21562129259109497, "step": 1221 }, { @@ -35424,27 +35424,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 832.552734375, - "completions/mean_terminated_length": 775.3844604492188, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/max_terminated_length": 1760.0, + "completions/mean_length": 724.310546875, + "completions/mean_terminated_length": 721.7201538085938, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, "epoch": 0.41717163096355725, - "grad_norm": 2.129183292388916, - "kl": 4.94921875, - "learning_rate": 7.514005053573156e-07, - "loss": 0.3138, - "num_tokens": 705418391.0, - "reward": 1.8369140625, - "reward_std": 0.5654664039611816, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.20277541875839233, + "grad_norm": 2.196227788925171, + "kl": 3.0859375, + "learning_rate": 7.517110845995055e-07, + "loss": 0.1794, + "num_tokens": 778716980.0, + "reward": 1.00830078125, + "reward_std": 0.2872357666492462, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90283203125, + "rewards/tag_count_reward/std": 0.20018774271011353, "step": 1222 }, { @@ -35453,27 +35453,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 780.529296875, - "completions/mean_terminated_length": 737.0000610351562, - "completions/min_length": 75.0, - "completions/min_terminated_length": 75.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1407.0, + "completions/max_terminated_length": 1407.0, + "completions/mean_length": 668.353515625, + "completions/mean_terminated_length": 668.353515625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, "epoch": 0.417513015276948, - "grad_norm": 2.336930990219116, - "kl": 4.640625, - "learning_rate": 7.509209450160639e-07, - "loss": 0.3067, - "num_tokens": 705897702.0, - "reward": 1.80859375, - "reward_std": 0.5186939239501953, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.19597215950489044, + "grad_norm": 2.449051856994629, + "kl": 2.85546875, + "learning_rate": 7.512315274870371e-07, + "loss": 0.1275, + "num_tokens": 779138857.0, + "reward": 0.9970703125, + "reward_std": 0.251140296459198, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.19051063060760498, "step": 1223 }, { @@ -35482,27 +35482,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1942.0, - "completions/mean_length": 790.908203125, - "completions/mean_terminated_length": 745.103271484375, - "completions/min_length": 120.0, - "completions/min_terminated_length": 120.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1727.0, + "completions/max_terminated_length": 1727.0, + "completions/mean_length": 693.0859375, + "completions/mean_terminated_length": 693.0859375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.4178543995903388, - "grad_norm": 3.638871908187866, - "kl": 3.94921875, - "learning_rate": 7.5044109950428e-07, - "loss": 0.3141, - "num_tokens": 706375447.0, - "reward": 1.8720703125, - "reward_std": 0.4504891335964203, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.1758417934179306, + "grad_norm": 2.5035128593444824, + "kl": 2.2265625, + "learning_rate": 7.507516845464797e-07, + "loss": 0.1043, + "num_tokens": 779566517.0, + "reward": 1.00390625, + "reward_std": 0.203842431306839, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.943359375, + "rewards/tag_count_reward/std": 0.16318395733833313, "step": 1224 }, { @@ -35511,27 +35511,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1884.0, - "completions/mean_length": 725.037109375, - "completions/mean_terminated_length": 687.8453369140625, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/max_terminated_length": 1440.0, + "completions/mean_length": 648.873046875, + "completions/mean_terminated_length": 646.135009765625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, "epoch": 0.41819578390372963, - "grad_norm": 2.105980634689331, - "kl": 3.8203125, - "learning_rate": 7.499609695030163e-07, - "loss": 0.2773, - "num_tokens": 706821706.0, - "reward": 1.90087890625, - "reward_std": 0.4926224946975708, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.94384765625, - "rewards/tag_count_reward/std": 0.1674204021692276, + "grad_norm": 2.15484619140625, + "kl": 3.29296875, + "learning_rate": 7.502715564593991e-07, + "loss": 0.172, + "num_tokens": 779973780.0, + "reward": 1.01123046875, + "reward_std": 0.29037126898765564, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90771484375, + "rewards/tag_count_reward/std": 0.21251004934310913, "step": 1225 }, { @@ -35540,27 +35540,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1997.0, - "completions/mean_length": 788.541015625, - "completions/mean_terminated_length": 745.2869262695312, - "completions/min_length": 205.0, - "completions/min_terminated_length": 205.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1650.0, + "completions/max_terminated_length": 1650.0, + "completions/mean_length": 729.49609375, + "completions/mean_terminated_length": 729.49609375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.41853716821712045, - "grad_norm": 2.395641565322876, - "kl": 6.296875, - "learning_rate": 7.494805556937299e-07, - "loss": 0.4447, - "num_tokens": 707303775.0, - "reward": 1.806640625, - "reward_std": 0.5870789289474487, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.912109375, - "rewards/tag_count_reward/std": 0.20946265757083893, + "grad_norm": 4.40711784362793, + "kl": 2.796875, + "learning_rate": 7.497911439077665e-07, + "loss": 0.1309, + "num_tokens": 780425618.0, + "reward": 0.9853515625, + "reward_std": 0.2513273358345032, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.1833125650882721, "step": 1226 }, { @@ -35569,27 +35569,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1978.0, - "completions/mean_length": 764.029296875, - "completions/mean_terminated_length": 746.231689453125, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 1744.0, + "completions/mean_length": 725.015625, + "completions/mean_terminated_length": 717.2180786132812, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.4188785525305112, - "grad_norm": 1.5705026388168335, - "kl": 6.0, - "learning_rate": 7.4899985875828e-07, - "loss": 0.3616, - "num_tokens": 707772318.0, - "reward": 1.841796875, - "reward_std": 0.5263835191726685, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.931640625, - "rewards/tag_count_reward/std": 0.18759170174598694, + "grad_norm": 2.1675784587860107, + "kl": 2.79296875, + "learning_rate": 7.493104475739574e-07, + "loss": 0.121, + "num_tokens": 780874186.0, + "reward": 1.03125, + "reward_std": 0.2659454643726349, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.17952290177345276, "step": 1227 }, { @@ -35598,27 +35598,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1880.0, - "completions/mean_length": 753.322265625, - "completions/mean_terminated_length": 722.2500610351562, - "completions/min_length": 45.0, - "completions/min_terminated_length": 45.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1692.0, + "completions/max_terminated_length": 1692.0, + "completions/mean_length": 706.96875, + "completions/mean_terminated_length": 706.96875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.419219936843902, - "grad_norm": 1.6948952674865723, - "kl": 5.75, - "learning_rate": 7.485188793789284e-07, - "loss": 0.3697, - "num_tokens": 708240227.0, - "reward": 1.84130859375, - "reward_std": 0.5240170955657959, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.1837882399559021, + "grad_norm": 2.6838104724884033, + "kl": 3.53515625, + "learning_rate": 7.488294681407498e-07, + "loss": 0.1447, + "num_tokens": 781318362.0, + "reward": 1.02099609375, + "reward_std": 0.3166106939315796, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91162109375, + "rewards/tag_count_reward/std": 0.19876503944396973, "step": 1228 }, { @@ -35627,27 +35627,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 798.45703125, - "completions/mean_terminated_length": 731.6090087890625, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1610.0, + "completions/max_terminated_length": 1610.0, + "completions/mean_length": 713.36328125, + "completions/mean_terminated_length": 713.36328125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, "epoch": 0.41956132115729283, - "grad_norm": 7.500605583190918, - "kl": 10.9375, - "learning_rate": 7.480376182383371e-07, - "loss": 0.6152, - "num_tokens": 708728573.0, - "reward": 1.78076171875, - "reward_std": 0.658714771270752, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.78515625, - "rewards/format_reward/std": 0.4111155867576599, - "rewards/tag_count_reward/mean": 0.89208984375, - "rewards/tag_count_reward/std": 0.22491775453090668, + "grad_norm": 5.827304363250732, + "kl": 4.6640625, + "learning_rate": 7.48348206291324e-07, + "loss": 0.1966, + "num_tokens": 781763140.0, + "reward": 1.02099609375, + "reward_std": 0.31554529070854187, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89794921875, + "rewards/tag_count_reward/std": 0.21032704412937164, "step": 1229 }, { @@ -35656,27 +35656,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 728.208984375, - "completions/mean_terminated_length": 688.376220703125, - "completions/min_length": 213.0, - "completions/min_terminated_length": 213.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 695.64453125, + "completions/mean_terminated_length": 692.998046875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, "epoch": 0.41990270547068365, - "grad_norm": 6.722501754760742, - "kl": 8.640625, - "learning_rate": 7.47556076019568e-07, - "loss": 0.4902, - "num_tokens": 709182520.0, - "reward": 1.830078125, - "reward_std": 0.5410079956054688, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.1937359869480133, + "grad_norm": 1.852268934249878, + "kl": 3.27734375, + "learning_rate": 7.478666627092618e-07, + "loss": 0.1312, + "num_tokens": 782200414.0, + "reward": 1.0224609375, + "reward_std": 0.2842390835285187, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9169921875, + "rewards/tag_count_reward/std": 0.18948465585708618, "step": 1230 }, { @@ -35685,27 +35685,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 752.740234375, - "completions/mean_terminated_length": 708.256591796875, - "completions/min_length": 167.0, - "completions/min_terminated_length": 167.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1801.0, + "completions/max_terminated_length": 1801.0, + "completions/mean_length": 726.1328125, + "completions/mean_terminated_length": 726.1328125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.4202440897840744, - "grad_norm": 5.435886859893799, - "kl": 9.375, - "learning_rate": 7.470742534060827e-07, - "loss": 0.5561, - "num_tokens": 709643955.0, - "reward": 1.79248046875, - "reward_std": 0.6186740398406982, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.791015625, - "rewards/format_reward/std": 0.40698084235191345, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.20516635477542877, + "grad_norm": 2.6435227394104004, + "kl": 3.3671875, + "learning_rate": 7.473848380785448e-07, + "loss": 0.1375, + "num_tokens": 782648226.0, + "reward": 1.03173828125, + "reward_std": 0.27165985107421875, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91064453125, + "rewards/tag_count_reward/std": 0.19832709431648254, "step": 1231 }, { @@ -35714,27 +35714,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1975.0, - "completions/mean_length": 756.39453125, - "completions/mean_terminated_length": 712.036376953125, - "completions/min_length": 200.0, - "completions/min_terminated_length": 200.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1672.0, + "completions/max_terminated_length": 1672.0, + "completions/mean_length": 727.642578125, + "completions/mean_terminated_length": 727.642578125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, "epoch": 0.4205854740974652, - "grad_norm": 6.821199893951416, - "kl": 8.703125, - "learning_rate": 7.465921510817401e-07, - "loss": 0.5082, - "num_tokens": 710121453.0, - "reward": 1.8095703125, - "reward_std": 0.5002506971359253, - "rewards/accuracy_reward/mean": 0.06653226166963577, - "rewards/accuracy_reward/std": 0.2494617998600006, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.20379072427749634, + "grad_norm": 3.1454718112945557, + "kl": 3.6484375, + "learning_rate": 7.469027330835536e-07, + "loss": 0.1456, + "num_tokens": 783111003.0, + "reward": 1.00732421875, + "reward_std": 0.28986674547195435, + "rewards/accuracy_reward/mean": 0.10080645233392715, + "rewards/accuracy_reward/std": 0.30137622356414795, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90966796875, + "rewards/tag_count_reward/std": 0.2009500116109848, "step": 1232 }, { @@ -35743,27 +35743,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 776.669921875, - "completions/mean_terminated_length": 724.9898071289062, - "completions/min_length": 208.0, - "completions/min_terminated_length": 208.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2021.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 767.248046875, + "completions/mean_terminated_length": 767.248046875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.42092685841085603, - "grad_norm": 7.6722822189331055, - "kl": 10.03125, - "learning_rate": 7.461097697307962e-07, - "loss": 0.573, - "num_tokens": 710595492.0, - "reward": 1.73291015625, - "reward_std": 0.6036777496337891, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.78515625, - "rewards/format_reward/std": 0.4111155867576599, - "rewards/tag_count_reward/mean": 0.89697265625, - "rewards/tag_count_reward/std": 0.22064577043056488, + "grad_norm": 2.5196471214294434, + "kl": 3.328125, + "learning_rate": 7.464203484090679e-07, + "loss": 0.127, + "num_tokens": 783580218.0, + "reward": 0.9794921875, + "reward_std": 0.29774603247642517, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9033203125, + "rewards/tag_count_reward/std": 0.21025264263153076, "step": 1233 }, { @@ -35772,27 +35772,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 759.509765625, - "completions/mean_terminated_length": 707.132080078125, - "completions/min_length": 169.0, - "completions/min_terminated_length": 169.0, + "completions/max_terminated_length": 1672.0, + "completions/mean_length": 738.099609375, + "completions/mean_terminated_length": 735.5361938476562, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, "epoch": 0.42126824272424684, - "grad_norm": 1.8243650197982788, - "kl": 7.5546875, - "learning_rate": 7.456271100379031e-07, - "loss": 0.5141, - "num_tokens": 711057993.0, - "reward": 1.79345703125, - "reward_std": 0.573222279548645, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.20922017097473145, + "grad_norm": 1.862230658531189, + "kl": 3.16796875, + "learning_rate": 7.459376847402637e-07, + "loss": 0.1414, + "num_tokens": 784031757.0, + "reward": 0.9765625, + "reward_std": 0.2748311758041382, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.908203125, + "rewards/tag_count_reward/std": 0.19689640402793884, "step": 1234 }, { @@ -35801,27 +35801,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1812.0, - "completions/mean_length": 745.484375, - "completions/mean_terminated_length": 687.0040283203125, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/max_terminated_length": 1786.0, + "completions/mean_length": 707.779296875, + "completions/mean_terminated_length": 705.1565551757812, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, "epoch": 0.4216096270376376, - "grad_norm": 1.6971056461334229, - "kl": 7.0234375, - "learning_rate": 7.451441726881082e-07, - "loss": 0.4601, - "num_tokens": 711523889.0, - "reward": 1.77783203125, - "reward_std": 0.5547708868980408, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.21192020177841187, + "grad_norm": 1.280155062675476, + "kl": 2.275390625, + "learning_rate": 7.454547427627136e-07, + "loss": 0.0877, + "num_tokens": 784478348.0, + "reward": 0.99169921875, + "reward_std": 0.23311598598957062, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.1742224544286728, "step": 1235 }, { @@ -35830,27 +35830,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 818.845703125, - "completions/mean_terminated_length": 766.2749633789062, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2030.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 806.701171875, + "completions/mean_terminated_length": 806.701171875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, "epoch": 0.4219510113510284, - "grad_norm": 1.6898390054702759, - "kl": 6.1875, - "learning_rate": 7.446609583668522e-07, - "loss": 0.4155, - "num_tokens": 712037026.0, - "reward": 1.82666015625, - "reward_std": 0.6324109435081482, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.2115049660205841, + "grad_norm": 2.0891847610473633, + "kl": 2.15234375, + "learning_rate": 7.449715231623857e-07, + "loss": 0.1038, + "num_tokens": 784985267.0, + "reward": 1.07177734375, + "reward_std": 0.3115023374557495, + "rewards/accuracy_reward/mean": 0.162109375, + "rewards/accuracy_reward/std": 0.3689115643501282, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90966796875, + "rewards/tag_count_reward/std": 0.19664327800273895, "step": 1236 }, { @@ -35859,27 +35859,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1965.0, - "completions/mean_length": 725.25390625, - "completions/mean_terminated_length": 688.0682373046875, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1628.0, + "completions/max_terminated_length": 1628.0, + "completions/mean_length": 717.900390625, + "completions/mean_terminated_length": 717.900390625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, "epoch": 0.42229239566441923, - "grad_norm": 1.8300591707229614, - "kl": 5.79296875, - "learning_rate": 7.441774677599699e-07, - "loss": 0.3901, - "num_tokens": 712485332.0, - "reward": 1.84521484375, - "reward_std": 0.5363088846206665, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.18577399849891663, + "grad_norm": 3.186643123626709, + "kl": 1.986328125, + "learning_rate": 7.444880266256425e-07, + "loss": 0.073, + "num_tokens": 785429808.0, + "reward": 1.0400390625, + "reward_std": 0.2679649293422699, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.18117332458496094, "step": 1237 }, { @@ -35888,27 +35888,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1907.0, - "completions/mean_length": 766.623046875, - "completions/mean_terminated_length": 700.843994140625, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 1809.0, + "completions/mean_length": 764.6015625, + "completions/mean_terminated_length": 762.0900268554688, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.42263377997781004, - "grad_norm": 2.1360459327697754, - "kl": 6.0234375, - "learning_rate": 7.436937015536876e-07, - "loss": 0.3957, - "num_tokens": 712956419.0, - "reward": 1.7958984375, - "reward_std": 0.573586106300354, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, + "grad_norm": 1.5022865533828735, + "kl": 2.333984375, + "learning_rate": 7.440042538392393e-07, + "loss": 0.0902, + "num_tokens": 785899860.0, + "reward": 1.0107421875, + "reward_std": 0.27494916319847107, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.20617742836475372, + "rewards/tag_count_reward/std": 0.19645671546459198, "step": 1238 }, { @@ -35917,27 +35917,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 746.923828125, - "completions/mean_terminated_length": 707.6558837890625, - "completions/min_length": 34.0, - "completions/min_terminated_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2021.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 751.791015625, + "completions/mean_terminated_length": 751.791015625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.4229751642912008, - "grad_norm": 3.2511074542999268, - "kl": 5.9609375, - "learning_rate": 7.432096604346231e-07, - "loss": 0.4301, - "num_tokens": 713414972.0, - "reward": 1.82470703125, - "reward_std": 0.5387951135635376, - "rewards/accuracy_reward/mean": 0.05443548411130905, - "rewards/accuracy_reward/std": 0.227104052901268, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.18481481075286865, + "grad_norm": 5.304492950439453, + "kl": 2.1953125, + "learning_rate": 7.435202054903244e-07, + "loss": 0.105, + "num_tokens": 786360905.0, + "reward": 0.99560546875, + "reward_std": 0.2517945468425751, + "rewards/accuracy_reward/mean": 0.07661290466785431, + "rewards/accuracy_reward/std": 0.2662447690963745, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.1897486001253128, "step": 1239 }, { @@ -35946,27 +35946,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 769.263671875, - "completions/mean_terminated_length": 738.5740356445312, - "completions/min_length": 104.0, - "completions/min_terminated_length": 104.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1920.0, + "completions/max_terminated_length": 1920.0, + "completions/mean_length": 741.369140625, + "completions/mean_terminated_length": 741.369140625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, "epoch": 0.4233165486045916, - "grad_norm": 1.5109126567840576, - "kl": 4.640625, - "learning_rate": 7.427253450897844e-07, - "loss": 0.2805, - "num_tokens": 713888643.0, - "reward": 1.78271484375, - "reward_std": 0.5137543082237244, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.18625685572624207, + "grad_norm": 2.2809250354766846, + "kl": 2.19921875, + "learning_rate": 7.430358822664371e-07, + "loss": 0.1142, + "num_tokens": 786820294.0, + "reward": 0.98779296875, + "reward_std": 0.2326989322900772, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91748046875, + "rewards/tag_count_reward/std": 0.18145988881587982, "step": 1240 }, { @@ -35975,27 +35975,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1735.0, - "completions/mean_length": 800.2421875, - "completions/mean_terminated_length": 749.5203247070312, - "completions/min_length": 224.0, - "completions/min_terminated_length": 224.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1903.0, + "completions/max_terminated_length": 1903.0, + "completions/mean_length": 791.037109375, + "completions/mean_terminated_length": 791.037109375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.4236579329179824, - "grad_norm": 1.2704123258590698, - "kl": 6.0859375, - "learning_rate": 7.422407562065678e-07, - "loss": 0.3967, - "num_tokens": 714374799.0, - "reward": 1.85009765625, - "reward_std": 0.5252021551132202, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, + "grad_norm": 1.7501929998397827, + "kl": 2.14453125, + "learning_rate": 7.425512848555073e-07, + "loss": 0.0756, + "num_tokens": 787301737.0, + "reward": 1.03369140625, + "reward_std": 0.26344534754753113, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.19307827949523926, + "rewards/tag_count_reward/std": 0.17090001702308655, "step": 1241 }, { @@ -36004,27 +36004,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 819.794921875, - "completions/mean_terminated_length": 767.2648315429688, - "completions/min_length": 206.0, - "completions/min_terminated_length": 206.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1855.0, + "completions/max_terminated_length": 1855.0, + "completions/mean_length": 790.51953125, + "completions/mean_terminated_length": 790.51953125, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, "epoch": 0.42399931723137324, - "grad_norm": 2.8821945190429688, - "kl": 8.484375, - "learning_rate": 7.417558944727593e-07, - "loss": 0.5146, - "num_tokens": 714881830.0, - "reward": 1.75341796875, - "reward_std": 0.602732241153717, - "rewards/accuracy_reward/mean": 0.07258064299821854, - "rewards/accuracy_reward/std": 0.25970885157585144, - "rewards/format_reward/mean": 0.779296875, - "rewards/format_reward/std": 0.4151262938976288, - "rewards/tag_count_reward/mean": 0.90380859375, - "rewards/tag_count_reward/std": 0.20725619792938232, + "grad_norm": 2.1573097705841064, + "kl": 2.580078125, + "learning_rate": 7.420664139458546e-07, + "loss": 0.1082, + "num_tokens": 787793779.0, + "reward": 0.990234375, + "reward_std": 0.29185423254966736, + "rewards/accuracy_reward/mean": 0.0786290317773819, + "rewards/accuracy_reward/std": 0.26943066716194153, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.912109375, + "rewards/tag_count_reward/std": 0.19805769622325897, "step": 1242 }, { @@ -36033,27 +36033,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 772.87890625, - "completions/mean_terminated_length": 721.044677734375, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1810.0, + "completions/max_terminated_length": 1810.0, + "completions/mean_length": 701.197265625, + "completions/mean_terminated_length": 701.197265625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.424340701544764, - "grad_norm": 5.812600612640381, - "kl": 10.8125, - "learning_rate": 7.412707605765313e-07, - "loss": 0.6363, - "num_tokens": 715354312.0, - "reward": 1.69921875, - "reward_std": 0.6376601457595825, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.76953125, - "rewards/format_reward/std": 0.42154473066329956, - "rewards/tag_count_reward/mean": 0.892578125, - "rewards/tag_count_reward/std": 0.22812001407146454, + "grad_norm": 2.4701790809631348, + "kl": 2.51171875, + "learning_rate": 7.415812702261864e-07, + "loss": 0.0973, + "num_tokens": 788229560.0, + "reward": 0.98193359375, + "reward_std": 0.19726133346557617, + "rewards/accuracy_reward/mean": 0.044921875, + "rewards/accuracy_reward/std": 0.20733514428138733, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.16570508480072021, "step": 1243 }, { @@ -36062,27 +36062,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 793.798828125, - "completions/mean_terminated_length": 763.6980590820312, - "completions/min_length": 217.0, - "completions/min_terminated_length": 217.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 812.02734375, + "completions/mean_terminated_length": 809.6085815429688, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.4246820858581548, - "grad_norm": 3.165933609008789, - "kl": 7.3671875, - "learning_rate": 7.407853552064425e-07, - "loss": 0.4275, - "num_tokens": 715839825.0, - "reward": 1.83447265625, - "reward_std": 0.575330376625061, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.20025932788848877, + "grad_norm": 1.3704205751419067, + "kl": 2.5546875, + "learning_rate": 7.410958543855983e-07, + "loss": 0.1239, + "num_tokens": 788724406.0, + "reward": 1.0400390625, + "reward_std": 0.2781640291213989, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.19119153916835785, "step": 1244 }, { @@ -36091,27 +36091,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 780.966796875, - "completions/mean_terminated_length": 747.9579467773438, - "completions/min_length": 205.0, - "completions/min_terminated_length": 205.0, + "completions/max_terminated_length": 1942.0, + "completions/mean_length": 738.068359375, + "completions/mean_terminated_length": 732.931396484375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, "epoch": 0.4250234701715456, - "grad_norm": 1.9695323705673218, - "kl": 7.546875, - "learning_rate": 7.40299679051437e-07, - "loss": 0.46, - "num_tokens": 716316496.0, - "reward": 1.76806640625, - "reward_std": 0.5813748836517334, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.19820177555084229, + "grad_norm": 3.5096609592437744, + "kl": 3.470703125, + "learning_rate": 7.406101671135721e-07, + "loss": 0.1393, + "num_tokens": 789179113.0, + "reward": 0.99169921875, + "reward_std": 0.25856339931488037, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90771484375, + "rewards/tag_count_reward/std": 0.1951066106557846, "step": 1245 }, { @@ -36120,27 +36120,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 827.5390625, - "completions/mean_terminated_length": 759.5958862304688, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 1627.0, + "completions/mean_length": 749.0625, + "completions/mean_terminated_length": 746.5205688476562, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.42536485448493644, - "grad_norm": 4.001309394836426, - "kl": 9.09375, - "learning_rate": 7.398137328008435e-07, - "loss": 0.521, - "num_tokens": 716828212.0, - "reward": 1.712890625, - "reward_std": 0.6477146744728088, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.76171875, - "rewards/format_reward/std": 0.42644867300987244, - "rewards/tag_count_reward/mean": 0.890625, - "rewards/tag_count_reward/std": 0.22879758477210999, + "grad_norm": 2.27182674407959, + "kl": 3.5078125, + "learning_rate": 7.40124209099975e-07, + "loss": 0.1474, + "num_tokens": 789650649.0, + "reward": 0.9833984375, + "reward_std": 0.260844886302948, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9052734375, + "rewards/tag_count_reward/std": 0.20467032492160797, "step": 1246 }, { @@ -36149,27 +36149,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1951.0, - "completions/mean_length": 797.966796875, - "completions/mean_terminated_length": 749.7910766601562, - "completions/min_length": 219.0, - "completions/min_terminated_length": 219.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 766.7265625, + "completions/mean_terminated_length": 759.1748657226562, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, "epoch": 0.4257062387983272, - "grad_norm": 1.3594669103622437, - "kl": 6.45703125, - "learning_rate": 7.393275171443737e-07, - "loss": 0.3914, - "num_tokens": 717316627.0, - "reward": 1.77197265625, - "reward_std": 0.5782288312911987, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.77734375, - "rewards/format_reward/std": 0.41643625497817993, - "rewards/tag_count_reward/mean": 0.90283203125, - "rewards/tag_count_reward/std": 0.21263141930103302, + "grad_norm": 2.3704638481140137, + "kl": 3.69140625, + "learning_rate": 7.396379810350591e-07, + "loss": 0.19, + "num_tokens": 790123069.0, + "reward": 1.0078125, + "reward_std": 0.2714865505695343, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.904296875, + "rewards/tag_count_reward/std": 0.20659643411636353, "step": 1247 }, { @@ -36178,27 +36178,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1960.0, - "completions/mean_length": 825.515625, - "completions/mean_terminated_length": 762.759765625, - "completions/min_length": 53.0, - "completions/min_terminated_length": 53.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1753.0, + "completions/max_terminated_length": 1753.0, + "completions/mean_length": 736.0546875, + "completions/mean_terminated_length": 736.0546875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.426047623111718, - "grad_norm": 1.1935927867889404, - "kl": 6.5703125, - "learning_rate": 7.388410327721218e-07, - "loss": 0.4069, - "num_tokens": 717821659.0, - "reward": 1.7099609375, - "reward_std": 0.5673522353172302, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.775390625, - "rewards/format_reward/std": 0.41773295402526855, - "rewards/tag_count_reward/mean": 0.8935546875, - "rewards/tag_count_reward/std": 0.23229363560676575, + "grad_norm": 3.7653613090515137, + "kl": 2.69921875, + "learning_rate": 7.3915148360946e-07, + "loss": 0.1059, + "num_tokens": 790582297.0, + "reward": 1.00439453125, + "reward_std": 0.26716482639312744, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.91455078125, + "rewards/tag_count_reward/std": 0.19065289199352264, "step": 1248 }, { @@ -36207,27 +36207,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1921.0, - "completions/mean_length": 816.89453125, - "completions/mean_terminated_length": 772.0364379882812, - "completions/min_length": 90.0, - "completions/min_terminated_length": 90.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1935.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 747.12109375, + "completions/mean_terminated_length": 747.12109375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, "epoch": 0.4263890074251088, - "grad_norm": 1.9006518125534058, - "kl": 6.1875, - "learning_rate": 7.383542803745632e-07, - "loss": 0.3989, - "num_tokens": 718311525.0, - "reward": 1.759765625, - "reward_std": 0.6409410834312439, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.791015625, - "rewards/format_reward/std": 0.40698084235191345, - "rewards/tag_count_reward/mean": 0.896484375, - "rewards/tag_count_reward/std": 0.22670821845531464, + "grad_norm": 3.4539995193481445, + "kl": 2.31640625, + "learning_rate": 7.386647175141955e-07, + "loss": 0.0822, + "num_tokens": 791036439.0, + "reward": 1.0341796875, + "reward_std": 0.2673919200897217, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.17627599835395813, "step": 1249 }, { @@ -36236,27 +36236,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 844.4765625, - "completions/mean_terminated_length": 785.286865234375, - "completions/min_length": 46.0, - "completions/min_terminated_length": 46.0, + "completions/max_terminated_length": 1841.0, + "completions/mean_length": 791.865234375, + "completions/mean_terminated_length": 789.4070434570312, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.42673039173849964, - "grad_norm": 0.9973497986793518, - "kl": 7.6171875, - "learning_rate": 7.378672606425542e-07, - "loss": 0.4726, - "num_tokens": 718816281.0, - "reward": 1.6767578125, - "reward_std": 0.683112382888794, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.740234375, - "rewards/format_reward/std": 0.4389347732067108, - "rewards/tag_count_reward/mean": 0.8642578125, - "rewards/tag_count_reward/std": 0.25485852360725403, + "grad_norm": 1.7899456024169922, + "kl": 2.18359375, + "learning_rate": 7.381776834406656e-07, + "loss": 0.1077, + "num_tokens": 791514258.0, + "reward": 1.0263671875, + "reward_std": 0.24448788166046143, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.17217355966567993, "step": 1250 }, { @@ -36265,27 +36265,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1970.0, - "completions/mean_length": 862.634765625, - "completions/mean_terminated_length": 796.6453857421875, - "completions/min_length": 240.0, - "completions/min_terminated_length": 240.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1845.0, + "completions/max_terminated_length": 1845.0, + "completions/mean_length": 811.34765625, + "completions/mean_terminated_length": 811.34765625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.4270717760518904, - "grad_norm": 2.722085475921631, - "kl": 7.15625, - "learning_rate": 7.373799742673301e-07, - "loss": 0.4986, - "num_tokens": 719329614.0, - "reward": 1.732421875, - "reward_std": 0.6683371663093567, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.7578125, - "rewards/format_reward/std": 0.42882615327835083, - "rewards/tag_count_reward/mean": 0.890625, - "rewards/tag_count_reward/std": 0.22118698060512543, + "grad_norm": 2.092193365097046, + "kl": 2.23046875, + "learning_rate": 7.376903820806507e-07, + "loss": 0.1074, + "num_tokens": 792001332.0, + "reward": 1.0224609375, + "reward_std": 0.2799791693687439, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.18062397837638855, "step": 1251 }, { @@ -36294,27 +36294,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 828.90234375, - "completions/mean_terminated_length": 763.68310546875, - "completions/min_length": 31.0, - "completions/min_terminated_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1779.0, + "completions/max_terminated_length": 1779.0, + "completions/mean_length": 780.32421875, + "completions/mean_terminated_length": 780.32421875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.4274131603652812, - "grad_norm": 2.0579171180725098, - "kl": 6.4375, - "learning_rate": 7.36892421940505e-07, - "loss": 0.4052, - "num_tokens": 719825644.0, - "reward": 1.744140625, - "reward_std": 0.6250649690628052, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.791015625, - "rewards/format_reward/std": 0.40698084235191345, - "rewards/tag_count_reward/mean": 0.8984375, - "rewards/tag_count_reward/std": 0.22434400022029877, + "grad_norm": 3.0212109088897705, + "kl": 2.6015625, + "learning_rate": 7.372028141263105e-07, + "loss": 0.1017, + "num_tokens": 792472490.0, + "reward": 1.001953125, + "reward_std": 0.2717844843864441, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.19352872669696808, "step": 1252 }, { @@ -36323,27 +36323,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1969.0, - "completions/mean_length": 863.185546875, - "completions/mean_terminated_length": 797.226806640625, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/max_terminated_length": 1896.0, + "completions/mean_length": 789.748046875, + "completions/mean_terminated_length": 784.8137817382812, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, "epoch": 0.427754544678672, - "grad_norm": 2.6889405250549316, - "kl": 6.8984375, - "learning_rate": 7.364046043540699e-07, - "loss": 0.4256, - "num_tokens": 720349179.0, - "reward": 1.7109375, - "reward_std": 0.6374015808105469, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.76171875, - "rewards/format_reward/std": 0.42644867300987244, - "rewards/tag_count_reward/mean": 0.88671875, - "rewards/tag_count_reward/std": 0.23845018446445465, + "grad_norm": 3.5705649852752686, + "kl": 2.552734375, + "learning_rate": 7.36714980270184e-07, + "loss": 0.1059, + "num_tokens": 792958425.0, + "reward": 1.01513671875, + "reward_std": 0.2775269150733948, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.1937304437160492, "step": 1253 }, { @@ -36352,27 +36352,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 849.505859375, - "completions/mean_terminated_length": 772.2640380859375, - "completions/min_length": 69.0, - "completions/min_terminated_length": 69.0, + "completions/max_terminated_length": 1764.0, + "completions/mean_length": 804.99609375, + "completions/mean_terminated_length": 797.6699829101562, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, "epoch": 0.42809592899206284, - "grad_norm": 1.7696506977081299, - "kl": 7.5, - "learning_rate": 7.359165222003927e-07, - "loss": 0.4706, - "num_tokens": 720863150.0, - "reward": 1.697265625, - "reward_std": 0.6631327867507935, - "rewards/accuracy_reward/mean": 0.06451612710952759, - "rewards/accuracy_reward/std": 0.2459181249141693, - "rewards/format_reward/mean": 0.755859375, - "rewards/format_reward/std": 0.42999663949012756, - "rewards/tag_count_reward/mean": 0.87890625, - "rewards/tag_count_reward/std": 0.24125482141971588, + "grad_norm": 4.68848991394043, + "kl": 3.61328125, + "learning_rate": 7.362268812051872e-07, + "loss": 0.1806, + "num_tokens": 793449607.0, + "reward": 1.0009765625, + "reward_std": 0.28966033458709717, + "rewards/accuracy_reward/mean": 0.0927419364452362, + "rewards/accuracy_reward/std": 0.2903633117675781, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9111328125, + "rewards/tag_count_reward/std": 0.20129980146884918, "step": 1254 }, { @@ -36381,27 +36381,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 802.404296875, - "completions/mean_terminated_length": 754.3995971679688, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_terminated_length": 1872.0, + "completions/mean_length": 787.98828125, + "completions/mean_terminated_length": 785.5225219726562, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, "epoch": 0.4284373133054536, - "grad_norm": 2.0506370067596436, - "kl": 5.53125, - "learning_rate": 7.354281761722168e-07, - "loss": 0.3899, - "num_tokens": 721352189.0, - "reward": 1.81494140625, - "reward_std": 0.5642163753509521, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.19817768037319183, + "grad_norm": 6.400665283203125, + "kl": 3.84375, + "learning_rate": 7.357385176246135e-07, + "loss": 0.167, + "num_tokens": 793931265.0, + "reward": 0.998046875, + "reward_std": 0.28555774688720703, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.2109626978635788, "step": 1255 }, { @@ -36410,27 +36410,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1989.0, - "completions/mean_length": 787.2421875, - "completions/mean_terminated_length": 730.63671875, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1891.0, + "completions/max_terminated_length": 1891.0, + "completions/mean_length": 751.001953125, + "completions/mean_terminated_length": 751.001953125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, "epoch": 0.4287786976188444, - "grad_norm": 2.414344072341919, - "kl": 6.5546875, - "learning_rate": 7.349395669626601e-07, - "loss": 0.4446, - "num_tokens": 721829449.0, - "reward": 1.8193359375, - "reward_std": 0.6030614376068115, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.20950597524642944, + "grad_norm": 1.992377519607544, + "kl": 2.15234375, + "learning_rate": 7.352498902221315e-07, + "loss": 0.085, + "num_tokens": 794389970.0, + "reward": 1.0068359375, + "reward_std": 0.23750057816505432, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.1651252657175064, "step": 1256 }, { @@ -36439,27 +36439,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1851.0, - "completions/mean_length": 751.89453125, - "completions/mean_terminated_length": 715.4578247070312, - "completions/min_length": 38.0, - "completions/min_terminated_length": 38.0, + "completions/max_terminated_length": 1840.0, + "completions/mean_length": 747.642578125, + "completions/mean_terminated_length": 745.0978393554688, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.4291200819322352, - "grad_norm": 0.9160401225090027, - "kl": 5.8359375, - "learning_rate": 7.344506952652141e-07, - "loss": 0.3673, - "num_tokens": 722288739.0, - "reward": 1.8515625, - "reward_std": 0.4972970485687256, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.1939331740140915, + "grad_norm": 2.9603893756866455, + "kl": 2.978515625, + "learning_rate": 7.347609996917849e-07, + "loss": 0.1505, + "num_tokens": 794847083.0, + "reward": 1.01318359375, + "reward_std": 0.24392500519752502, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93310546875, + "rewards/tag_count_reward/std": 0.18116475641727448, "step": 1257 }, { @@ -36468,27 +36468,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 796.3515625, - "completions/mean_terminated_length": 755.9757690429688, - "completions/min_length": 249.0, - "completions/min_terminated_length": 249.0, + "completions/max_terminated_length": 1768.0, + "completions/mean_length": 791.625, + "completions/mean_terminated_length": 786.6980590820312, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.42946146624562603, - "grad_norm": 1.0107033252716064, - "kl": 5.984375, - "learning_rate": 7.339615617737427e-07, - "loss": 0.3545, - "num_tokens": 722772663.0, - "reward": 1.82080078125, - "reward_std": 0.5160529613494873, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.18617989122867584, + "grad_norm": 2.724947214126587, + "kl": 3.6171875, + "learning_rate": 7.342718467279908e-07, + "loss": 0.1886, + "num_tokens": 795328587.0, + "reward": 0.9609375, + "reward_std": 0.27307432889938354, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.896484375, + "rewards/tag_count_reward/std": 0.21958273649215698, "step": 1258 }, { @@ -36497,27 +36497,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 763.390625, - "completions/mean_terminated_length": 708.4481201171875, - "completions/min_length": 73.0, - "completions/min_terminated_length": 73.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1948.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 760.69140625, + "completions/mean_terminated_length": 760.69140625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.4298028505590168, - "grad_norm": 1.9055343866348267, - "kl": 9.0390625, - "learning_rate": 7.334721671824814e-07, - "loss": 0.5632, - "num_tokens": 723239583.0, - "reward": 1.76806640625, - "reward_std": 0.5623615384101868, - "rewards/accuracy_reward/mean": 0.025390625, - "rewards/accuracy_reward/std": 0.15746226906776428, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, + "grad_norm": 2.666703462600708, + "kl": 3.6640625, + "learning_rate": 7.337824320255394e-07, + "loss": 0.1743, + "num_tokens": 795794125.0, + "reward": 0.95166015625, + "reward_std": 0.22550147771835327, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.17416280508041382, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.20063117146492004, + "rewards/tag_count_reward/std": 0.19755955040454865, "step": 1259 }, { @@ -36526,27 +36526,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 709.197265625, - "completions/mean_terminated_length": 682.5278930664062, - "completions/min_length": 94.0, - "completions/min_terminated_length": 94.0, + "completions/max_terminated_length": 1836.0, + "completions/mean_length": 706.61328125, + "completions/mean_terminated_length": 701.3529663085938, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.4301442348724076, - "grad_norm": 1.7391200065612793, - "kl": 6.6015625, - "learning_rate": 7.329825121860363e-07, - "loss": 0.378, - "num_tokens": 723684116.0, - "reward": 1.8203125, - "reward_std": 0.5584942102432251, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.1948670893907547, + "grad_norm": 3.880563497543335, + "kl": 3.55078125, + "learning_rate": 7.33292756279592e-07, + "loss": 0.2, + "num_tokens": 796237335.0, + "reward": 0.99169921875, + "reward_std": 0.26497775316238403, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91748046875, + "rewards/tag_count_reward/std": 0.19759339094161987, "step": 1260 }, { @@ -36555,27 +36555,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1864.0, - "completions/mean_length": 751.787109375, - "completions/mean_terminated_length": 720.6780395507812, - "completions/min_length": 232.0, - "completions/min_terminated_length": 232.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1763.0, + "completions/max_terminated_length": 1763.0, + "completions/mean_length": 728.67578125, + "completions/mean_terminated_length": 728.67578125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.4304856191857984, - "grad_norm": 1.2627638578414917, - "kl": 5.83984375, - "learning_rate": 7.32492597479383e-07, - "loss": 0.3666, - "num_tokens": 724143479.0, - "reward": 1.869140625, - "reward_std": 0.44199299812316895, - "rewards/accuracy_reward/mean": 0.05443548411130905, - "rewards/accuracy_reward/std": 0.227104052901268, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.18036192655563354, + "grad_norm": 1.7961816787719727, + "kl": 3.0703125, + "learning_rate": 7.32802820185682e-07, + "loss": 0.1612, + "num_tokens": 796684865.0, + "reward": 0.98388671875, + "reward_std": 0.23252499103546143, + "rewards/accuracy_reward/mean": 0.07056451588869095, + "rewards/accuracy_reward/std": 0.25635457038879395, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91552734375, + "rewards/tag_count_reward/std": 0.20228178799152374, "step": 1261 }, { @@ -36584,27 +36584,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 743.861328125, - "completions/mean_terminated_length": 717.8825073242188, - "completions/min_length": 174.0, - "completions/min_terminated_length": 174.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 782.51953125, + "completions/mean_terminated_length": 777.556884765625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, "epoch": 0.43082700349918923, - "grad_norm": 1.1336578130722046, - "kl": 4.88671875, - "learning_rate": 7.32002423757866e-07, - "loss": 0.3102, - "num_tokens": 724602064.0, - "reward": 1.8837890625, - "reward_std": 0.4746038317680359, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.17237325012683868, + "grad_norm": 2.577136278152466, + "kl": 3.8515625, + "learning_rate": 7.32312624439711e-07, + "loss": 0.2034, + "num_tokens": 797163243.0, + "reward": 0.98681640625, + "reward_std": 0.2654762268066406, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91259765625, + "rewards/tag_count_reward/std": 0.2004214972257614, "step": 1262 }, { @@ -36613,27 +36613,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 749.705078125, - "completions/mean_terminated_length": 721.1995849609375, - "completions/min_length": 72.0, - "completions/min_terminated_length": 72.0, + "completions/max_terminated_length": 1854.0, + "completions/mean_length": 733.80859375, + "completions/mean_terminated_length": 726.0628662109375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.43116838781258, - "grad_norm": 2.871920347213745, - "kl": 6.328125, - "learning_rate": 7.315119917171972e-07, - "loss": 0.4186, - "num_tokens": 725058857.0, - "reward": 1.84521484375, - "reward_std": 0.4899643063545227, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.93505859375, - "rewards/tag_count_reward/std": 0.1784812957048416, + "grad_norm": 6.863286972045898, + "kl": 5.65625, + "learning_rate": 7.318221697379505e-07, + "loss": 0.2847, + "num_tokens": 797611897.0, + "reward": 0.923828125, + "reward_std": 0.2940230071544647, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.865234375, + "rewards/tag_count_reward/std": 0.24660581350326538, "step": 1263 }, { @@ -36642,27 +36642,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 794.529296875, - "completions/mean_terminated_length": 746.2210693359375, - "completions/min_length": 167.0, - "completions/min_terminated_length": 167.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1657.0, + "completions/max_terminated_length": 1657.0, + "completions/mean_length": 774.037109375, + "completions/mean_terminated_length": 774.037109375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.4315097721259708, - "grad_norm": 1.7654612064361572, - "kl": 7.21875, - "learning_rate": 7.310213020534549e-07, - "loss": 0.4814, - "num_tokens": 725552280.0, - "reward": 1.75830078125, - "reward_std": 0.5049154162406921, - "rewards/accuracy_reward/mean": 0.0078125, - "rewards/accuracy_reward/std": 0.08812850713729858, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.2022770643234253, + "grad_norm": 3.185276508331299, + "kl": 4.37109375, + "learning_rate": 7.313314567770397e-07, + "loss": 0.2343, + "num_tokens": 798094828.0, + "reward": 0.9013671875, + "reward_std": 0.2174476981163025, + "rewards/accuracy_reward/mean": 0.01171875, + "rewards/accuracy_reward/std": 0.10772226005792618, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8896484375, + "rewards/tag_count_reward/std": 0.2201455533504486, "step": 1264 }, { @@ -36671,27 +36671,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1970.0, - "completions/mean_length": 740.4453125, - "completions/mean_terminated_length": 687.2926635742188, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1563.0, + "completions/max_terminated_length": 1563.0, + "completions/mean_length": 665.466796875, + "completions/mean_terminated_length": 665.466796875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.4318511564393616, - "grad_norm": 5.806798934936523, - "kl": 9.05859375, - "learning_rate": 7.30530355463084e-07, - "loss": 0.5213, - "num_tokens": 726011420.0, - "reward": 1.80078125, - "reward_std": 0.6177265644073486, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.791015625, - "rewards/format_reward/std": 0.40698084235191345, - "rewards/tag_count_reward/mean": 0.900390625, - "rewards/tag_count_reward/std": 0.22138561308383942, + "grad_norm": 3.8278932571411133, + "kl": 4.890625, + "learning_rate": 7.308404862539841e-07, + "loss": 0.2506, + "num_tokens": 798515579.0, + "reward": 1.01171875, + "reward_std": 0.3120690584182739, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.890625, + "rewards/tag_count_reward/std": 0.22772592306137085, "step": 1265 }, { @@ -36700,27 +36700,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 758.076171875, - "completions/mean_terminated_length": 724.470947265625, - "completions/min_length": 106.0, - "completions/min_terminated_length": 106.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1911.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 702.96875, + "completions/mean_terminated_length": 702.96875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.43219254075275243, - "grad_norm": 3.785024642944336, - "kl": 8.96875, - "learning_rate": 7.300391526428928e-07, - "loss": 0.5283, - "num_tokens": 726481219.0, - "reward": 1.74169921875, - "reward_std": 0.6067723035812378, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.78515625, - "rewards/format_reward/std": 0.4111155867576599, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.2009500116109848, + "grad_norm": 3.5347719192504883, + "kl": 3.5625, + "learning_rate": 7.303492588661555e-07, + "loss": 0.2038, + "num_tokens": 798957163.0, + "reward": 0.98193359375, + "reward_std": 0.2749689817428589, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91357421875, + "rewards/tag_count_reward/std": 0.20919732749462128, "step": 1266 }, { @@ -36729,27 +36729,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 779.455078125, - "completions/mean_terminated_length": 725.1996459960938, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1710.0, + "completions/max_terminated_length": 1710.0, + "completions/mean_length": 692.478515625, + "completions/mean_terminated_length": 692.478515625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, "epoch": 0.4325339250661432, - "grad_norm": 3.3844306468963623, - "kl": 8.3828125, - "learning_rate": 7.295476942900539e-07, - "loss": 0.4816, - "num_tokens": 726966204.0, - "reward": 1.7646484375, - "reward_std": 0.626046895980835, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.76953125, - "rewards/format_reward/std": 0.42154473066329956, - "rewards/tag_count_reward/mean": 0.8974609375, - "rewards/tag_count_reward/std": 0.21096043288707733, + "grad_norm": 5.8749260902404785, + "kl": 4.0234375, + "learning_rate": 7.298577753112905e-07, + "loss": 0.25, + "num_tokens": 799397616.0, + "reward": 0.9833984375, + "reward_std": 0.30790799856185913, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8818359375, + "rewards/tag_count_reward/std": 0.23709788918495178, "step": 1267 }, { @@ -36758,27 +36758,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 788.740234375, - "completions/mean_terminated_length": 713.1325073242188, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1809.0, + "completions/max_terminated_length": 1809.0, + "completions/mean_length": 685.69921875, + "completions/mean_terminated_length": 685.69921875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, "epoch": 0.432875309379534, - "grad_norm": 3.069222927093506, - "kl": 9.65625, - "learning_rate": 7.290559811021029e-07, - "loss": 0.593, - "num_tokens": 727448071.0, - "reward": 1.783203125, - "reward_std": 0.6747581958770752, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, - "rewards/format_reward/mean": 0.775390625, - "rewards/format_reward/std": 0.41773295402526855, - "rewards/tag_count_reward/mean": 0.880859375, - "rewards/tag_count_reward/std": 0.23609022796154022, + "grad_norm": 2.666879892349243, + "kl": 3.8203125, + "learning_rate": 7.293660362874892e-07, + "loss": 0.1972, + "num_tokens": 799826726.0, + "reward": 1.05419921875, + "reward_std": 0.2964475750923157, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90380859375, + "rewards/tag_count_reward/std": 0.2084331214427948, "step": 1268 }, { @@ -36787,27 +36787,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 791.298828125, - "completions/mean_terminated_length": 742.8660888671875, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1562.0, + "completions/max_terminated_length": 1562.0, + "completions/mean_length": 734.87109375, + "completions/mean_terminated_length": 734.87109375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.4332166936929248, - "grad_norm": 1.4948716163635254, - "kl": 7.2265625, - "learning_rate": 7.285640137769363e-07, - "loss": 0.4889, - "num_tokens": 727925920.0, - "reward": 1.78662109375, - "reward_std": 0.6075412034988403, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.80078125, - "rewards/format_reward/std": 0.39980348944664, - "rewards/tag_count_reward/mean": 0.89794921875, - "rewards/tag_count_reward/std": 0.22924767434597015, + "grad_norm": 2.4318792819976807, + "kl": 3.84765625, + "learning_rate": 7.288740424932151e-07, + "loss": 0.1954, + "num_tokens": 800275684.0, + "reward": 0.9716796875, + "reward_std": 0.29261159896850586, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8759765625, + "rewards/tag_count_reward/std": 0.23822365701198578, "step": 1269 }, { @@ -36816,27 +36816,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 837.33984375, - "completions/mean_terminated_length": 795.7616577148438, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/max_terminated_length": 1742.0, + "completions/mean_length": 725.107421875, + "completions/mean_terminated_length": 722.5186157226562, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.43355807800631563, - "grad_norm": 1.5656808614730835, - "kl": 6.9765625, - "learning_rate": 7.280717930128119e-07, - "loss": 0.451, - "num_tokens": 728426094.0, - "reward": 1.7724609375, - "reward_std": 0.6285260915756226, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.779296875, - "rewards/format_reward/std": 0.4151262938976288, - "rewards/tag_count_reward/mean": 0.8955078125, - "rewards/tag_count_reward/std": 0.20588061213493347, + "grad_norm": 5.8231024742126465, + "kl": 3.78515625, + "learning_rate": 7.283817946272933e-07, + "loss": 0.1913, + "num_tokens": 800718395.0, + "reward": 0.9931640625, + "reward_std": 0.29661667346954346, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8779296875, + "rewards/tag_count_reward/std": 0.22608990967273712, "step": 1270 }, { @@ -36845,27 +36845,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 772.10546875, - "completions/mean_terminated_length": 706.6078491210938, - "completions/min_length": 186.0, - "completions/min_terminated_length": 186.0, + "completions/max_terminated_length": 1617.0, + "completions/mean_length": 667.91796875, + "completions/mean_terminated_length": 665.2172241210938, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, "epoch": 0.4338994623197064, - "grad_norm": 3.605860710144043, - "kl": 6.1484375, - "learning_rate": 7.275793195083474e-07, - "loss": 0.4371, - "num_tokens": 728898676.0, - "reward": 1.75, - "reward_std": 0.604926347732544, - "rewards/accuracy_reward/mean": 0.05645161122083664, - "rewards/accuracy_reward/std": 0.23102475702762604, - "rewards/format_reward/mean": 0.791015625, - "rewards/format_reward/std": 0.40698084235191345, - "rewards/tag_count_reward/mean": 0.904296875, - "rewards/tag_count_reward/std": 0.2175542116165161, + "grad_norm": 5.3429765701293945, + "kl": 3.8828125, + "learning_rate": 7.278892933889098e-07, + "loss": 0.238, + "num_tokens": 801137633.0, + "reward": 0.95068359375, + "reward_std": 0.24944081902503967, + "rewards/accuracy_reward/mean": 0.052419353276491165, + "rewards/accuracy_reward/std": 0.22309619188308716, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89990234375, + "rewards/tag_count_reward/std": 0.21068565547466278, "step": 1271 }, { @@ -36874,27 +36874,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1966.0, - "completions/mean_length": 777.689453125, - "completions/mean_terminated_length": 754.960205078125, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/max_terminated_length": 1576.0, + "completions/mean_length": 733.552734375, + "completions/mean_terminated_length": 730.9804077148438, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, "epoch": 0.4342408466330972, - "grad_norm": 2.478461265563965, - "kl": 3.43359375, - "learning_rate": 7.270865939625183e-07, - "loss": 0.2044, - "num_tokens": 729375685.0, - "reward": 1.83935546875, - "reward_std": 0.4839392304420471, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.175620898604393, + "grad_norm": 7.208449840545654, + "kl": 4.2890625, + "learning_rate": 7.273965394776105e-07, + "loss": 0.1718, + "num_tokens": 801592044.0, + "reward": 0.96044921875, + "reward_std": 0.2888133227825165, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87646484375, + "rewards/tag_count_reward/std": 0.23615851998329163, "step": 1272 }, { @@ -36903,27 +36903,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 754.076171875, - "completions/mean_terminated_length": 728.3008422851562, - "completions/min_length": 10.0, - "completions/min_terminated_length": 10.0, + "completions/max_terminated_length": 1862.0, + "completions/mean_length": 710.37109375, + "completions/mean_terminated_length": 702.4872436523438, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, "epoch": 0.434582230946488, - "grad_norm": 2.1076242923736572, - "kl": 4.44921875, - "learning_rate": 7.265936170746588e-07, - "loss": 0.2837, - "num_tokens": 729833676.0, - "reward": 1.83447265625, - "reward_std": 0.5441898107528687, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.18640044331550598, + "grad_norm": 3.591843843460083, + "kl": 5.7421875, + "learning_rate": 7.269035335933e-07, + "loss": 0.2721, + "num_tokens": 802027658.0, + "reward": 0.94287109375, + "reward_std": 0.3252767324447632, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.84716796875, + "rewards/tag_count_reward/std": 0.25404128432273865, "step": 1273 }, { @@ -36932,27 +36932,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 845.30078125, - "completions/mean_terminated_length": 778.346435546875, - "completions/min_length": 100.0, - "completions/min_terminated_length": 100.0, + "completions/max_terminated_length": 1592.0, + "completions/mean_length": 700.544921875, + "completions/mean_terminated_length": 695.2608032226562, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, "epoch": 0.4349236152598788, - "grad_norm": 1.2890641689300537, - "kl": 7.0234375, - "learning_rate": 7.261003895444593e-07, - "loss": 0.4465, - "num_tokens": 730338278.0, - "reward": 1.720703125, - "reward_std": 0.6220524907112122, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.763671875, - "rewards/format_reward/std": 0.42524150013923645, - "rewards/tag_count_reward/mean": 0.890625, - "rewards/tag_count_reward/std": 0.22229015827178955, + "grad_norm": 6.288658618927002, + "kl": 4.76953125, + "learning_rate": 7.264102764362412e-07, + "loss": 0.2099, + "num_tokens": 802458145.0, + "reward": 0.94970703125, + "reward_std": 0.32947617769241333, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86376953125, + "rewards/tag_count_reward/std": 0.24203728139400482, "step": 1274 }, { @@ -36961,27 +36961,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 857.912109375, - "completions/mean_terminated_length": 801.9365844726562, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 1849.0, + "completions/mean_length": 769.71875, + "completions/mean_terminated_length": 759.653564453125, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, "epoch": 0.4352649995732696, - "grad_norm": 2.3086037635803223, - "kl": 7.453125, - "learning_rate": 7.256069120719661e-07, - "loss": 0.4659, - "num_tokens": 730845465.0, - "reward": 1.744140625, - "reward_std": 0.6320393085479736, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.78125, - "rewards/format_reward/std": 0.41380295157432556, - "rewards/tag_count_reward/mean": 0.88671875, - "rewards/tag_count_reward/std": 0.23431077599525452, + "grad_norm": 4.0882744789123535, + "kl": 3.73046875, + "learning_rate": 7.259167687070534e-07, + "loss": 0.1843, + "num_tokens": 802920177.0, + "reward": 0.95556640625, + "reward_std": 0.26851123571395874, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.88330078125, + "rewards/tag_count_reward/std": 0.23024585843086243, "step": 1275 }, { @@ -36990,27 +36990,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 780.814453125, - "completions/mean_terminated_length": 742.5693969726562, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1611.0, + "completions/max_terminated_length": 1611.0, + "completions/mean_length": 692.216796875, + "completions/mean_terminated_length": 692.216796875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, "epoch": 0.4356063838866604, - "grad_norm": 1.7257732152938843, - "kl": 6.84375, - "learning_rate": 7.2511318535758e-07, - "loss": 0.408, - "num_tokens": 731321274.0, - "reward": 1.724609375, - "reward_std": 0.6101690530776978, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.767578125, - "rewards/format_reward/std": 0.42278963327407837, - "rewards/tag_count_reward/mean": 0.892578125, - "rewards/tag_count_reward/std": 0.21993058919906616, + "grad_norm": 4.951401233673096, + "kl": 3.01171875, + "learning_rate": 7.254230111067126e-07, + "loss": 0.1719, + "num_tokens": 803350624.0, + "reward": 1.0068359375, + "reward_std": 0.2870011627674103, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9111328125, + "rewards/tag_count_reward/std": 0.20069128274917603, "step": 1276 }, { @@ -37019,27 +37019,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 795.44921875, - "completions/mean_terminated_length": 747.1764526367188, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1602.0, + "completions/max_terminated_length": 1602.0, + "completions/mean_length": 720.58203125, + "completions/mean_terminated_length": 720.58203125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.4359477682000512, - "grad_norm": 2.6494693756103516, - "kl": 7.5625, - "learning_rate": 7.246192101020559e-07, - "loss": 0.4435, - "num_tokens": 731797984.0, - "reward": 1.720703125, - "reward_std": 0.604543149471283, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.767578125, - "rewards/format_reward/std": 0.42278963327407837, - "rewards/tag_count_reward/mean": 0.884765625, - "rewards/tag_count_reward/std": 0.23387810587882996, + "grad_norm": 3.016556978225708, + "kl": 2.84375, + "learning_rate": 7.249290043365487e-07, + "loss": 0.1265, + "num_tokens": 803789002.0, + "reward": 1.03271484375, + "reward_std": 0.29467931389808655, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90185546875, + "rewards/tag_count_reward/std": 0.20869427919387817, "step": 1277 }, { @@ -37048,27 +37048,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1912.0, - "completions/mean_length": 778.177734375, - "completions/mean_terminated_length": 737.2156982421875, - "completions/min_length": 134.0, - "completions/min_terminated_length": 134.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1739.0, + "completions/max_terminated_length": 1739.0, + "completions/mean_length": 757.216796875, + "completions/mean_terminated_length": 757.216796875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.436289152513442, - "grad_norm": 1.5474677085876465, - "kl": 5.46875, - "learning_rate": 7.241249870065014e-07, - "loss": 0.3395, - "num_tokens": 732274187.0, - "reward": 1.7666015625, - "reward_std": 0.5756736397743225, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.9111328125, - "rewards/tag_count_reward/std": 0.20129980146884918, + "grad_norm": 3.0540530681610107, + "kl": 3.203125, + "learning_rate": 7.244347490982463e-07, + "loss": 0.1638, + "num_tokens": 804254473.0, + "reward": 0.9794921875, + "reward_std": 0.2800787091255188, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9052734375, + "rewards/tag_count_reward/std": 0.21798203885555267, "step": 1278 }, { @@ -37077,27 +37077,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 837.470703125, - "completions/mean_terminated_length": 762.1265869140625, - "completions/min_length": 74.0, - "completions/min_terminated_length": 74.0, + "completions/max_terminated_length": 1725.0, + "completions/mean_length": 771.798828125, + "completions/mean_terminated_length": 769.3013916015625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.4366305368268328, - "grad_norm": 1.7922933101654053, - "kl": 7.546875, - "learning_rate": 7.236305167723758e-07, - "loss": 0.4332, - "num_tokens": 732782540.0, - "reward": 1.7109375, - "reward_std": 0.654686450958252, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.7578125, - "rewards/format_reward/std": 0.42882615327835083, - "rewards/tag_count_reward/mean": 0.873046875, - "rewards/tag_count_reward/std": 0.24679172039031982, + "grad_norm": 4.37945032119751, + "kl": 2.75, + "learning_rate": 7.239402460938427e-07, + "loss": 0.1498, + "num_tokens": 804729202.0, + "reward": 0.994140625, + "reward_std": 0.2735789716243744, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.904296875, + "rewards/tag_count_reward/std": 0.2095356285572052, "step": 1279 }, { @@ -37106,27 +37106,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1996.0, - "completions/mean_length": 832.134765625, - "completions/mean_terminated_length": 782.7093505859375, - "completions/min_length": 95.0, - "completions/min_terminated_length": 95.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1572.0, + "completions/max_terminated_length": 1572.0, + "completions/mean_length": 762.8359375, + "completions/mean_terminated_length": 762.8359375, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, "epoch": 0.4369719211402236, - "grad_norm": 0.7395869493484497, - "kl": 7.3515625, - "learning_rate": 7.231358001014891e-07, - "loss": 0.4439, - "num_tokens": 733279329.0, - "reward": 1.71826171875, - "reward_std": 0.5961915850639343, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.775390625, - "rewards/format_reward/std": 0.41773295402526855, - "rewards/tag_count_reward/mean": 0.89404296875, - "rewards/tag_count_reward/std": 0.21869266033172607, + "grad_norm": 3.2914984226226807, + "kl": 2.6953125, + "learning_rate": 7.234454960257271e-07, + "loss": 0.133, + "num_tokens": 805190510.0, + "reward": 0.96240234375, + "reward_std": 0.24730157852172852, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.90966796875, + "rewards/tag_count_reward/std": 0.20337004959583282, "step": 1280 }, { @@ -37135,27 +37135,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1906.0, - "completions/mean_length": 848.68359375, - "completions/mean_terminated_length": 789.7008056640625, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1950.0, + "completions/max_terminated_length": 1950.0, + "completions/mean_length": 790.53125, + "completions/mean_terminated_length": 790.53125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.4373133054536144, - "grad_norm": 1.4757400751113892, - "kl": 6.90625, - "learning_rate": 7.22640837696001e-07, - "loss": 0.4081, - "num_tokens": 733801391.0, - "reward": 1.69677734375, - "reward_std": 0.6439635753631592, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.751953125, - "rewards/format_reward/std": 0.4323015511035919, - "rewards/tag_count_reward/mean": 0.88427734375, - "rewards/tag_count_reward/std": 0.22807340323925018, + "grad_norm": 1.8159061670303345, + "kl": 2.611328125, + "learning_rate": 7.229504995966393e-07, + "loss": 0.126, + "num_tokens": 805682798.0, + "reward": 1.0, + "reward_std": 0.26393765211105347, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.20979996025562286, "step": 1281 }, { @@ -37164,27 +37164,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1975.0, - "completions/mean_length": 739.98046875, - "completions/mean_terminated_length": 697.7862548828125, - "completions/min_length": 173.0, - "completions/min_terminated_length": 173.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1844.0, + "completions/max_terminated_length": 1844.0, + "completions/mean_length": 714.013671875, + "completions/mean_terminated_length": 714.013671875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.4376546897670052, - "grad_norm": 2.648240804672241, - "kl": 5.7734375, - "learning_rate": 7.221456302584202e-07, - "loss": 0.3839, - "num_tokens": 734254277.0, - "reward": 1.78564453125, - "reward_std": 0.5704188346862793, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.21203739941120148, + "grad_norm": 3.124727964401245, + "kl": 3.44140625, + "learning_rate": 7.224552575096699e-07, + "loss": 0.1595, + "num_tokens": 806122389.0, + "reward": 0.95849609375, + "reward_std": 0.25195011496543884, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90185546875, + "rewards/tag_count_reward/std": 0.21333131194114685, "step": 1282 }, { @@ -37193,27 +37193,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1829.0, - "completions/mean_length": 827.361328125, - "completions/mean_terminated_length": 754.0724487304688, - "completions/min_length": 66.0, - "completions/min_terminated_length": 66.0, + "completions/max_terminated_length": 1808.0, + "completions/mean_length": 768.068359375, + "completions/mean_terminated_length": 765.5635986328125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.437996074080396, - "grad_norm": 1.8201979398727417, - "kl": 5.515625, - "learning_rate": 7.216501784916032e-07, - "loss": 0.3247, - "num_tokens": 734752654.0, - "reward": 1.71630859375, - "reward_std": 0.5852100849151611, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.78515625, - "rewards/format_reward/std": 0.4111155867576599, - "rewards/tag_count_reward/mean": 0.88818359375, - "rewards/tag_count_reward/std": 0.23160721361637115, + "grad_norm": 3.7241928577423096, + "kl": 3.6640625, + "learning_rate": 7.219597704682572e-07, + "loss": 0.1596, + "num_tokens": 806590408.0, + "reward": 0.93310546875, + "reward_std": 0.2621708810329437, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87841796875, + "rewards/tag_count_reward/std": 0.22770023345947266, "step": 1283 }, { @@ -37222,27 +37222,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 767.416015625, - "completions/mean_terminated_length": 709.9203491210938, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 1555.0, + "completions/mean_length": 675.97265625, + "completions/mean_terminated_length": 673.2876586914062, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.4383374583937868, - "grad_norm": 2.1691770553588867, - "kl": 5.828125, - "learning_rate": 7.211544830987533e-07, - "loss": 0.4291, - "num_tokens": 735235955.0, - "reward": 1.826171875, - "reward_std": 0.5387592315673828, - "rewards/accuracy_reward/mean": 0.06653226166963577, - "rewards/accuracy_reward/std": 0.2494617998600006, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.2081446796655655, + "grad_norm": 2.2340805530548096, + "kl": 3.7421875, + "learning_rate": 7.214640391761887e-07, + "loss": 0.1735, + "num_tokens": 807026890.0, + "reward": 0.9765625, + "reward_std": 0.29601624608039856, + "rewards/accuracy_reward/mean": 0.07661290466785431, + "rewards/accuracy_reward/std": 0.2662447690963745, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.21724654734134674, "step": 1284 }, { @@ -37251,27 +37251,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1964.0, - "completions/mean_length": 839.587890625, - "completions/mean_terminated_length": 782.75048828125, - "completions/min_length": 167.0, - "completions/min_terminated_length": 167.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1592.0, + "completions/max_terminated_length": 1592.0, + "completions/mean_length": 761.462890625, + "completions/mean_terminated_length": 761.462890625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, "epoch": 0.4386788427071776, - "grad_norm": 1.0244237184524536, - "kl": 7.5546875, - "learning_rate": 7.206585447834188e-07, - "loss": 0.4849, - "num_tokens": 735737504.0, - "reward": 1.7470703125, - "reward_std": 0.6523882150650024, - "rewards/accuracy_reward/mean": 0.06653226166963577, - "rewards/accuracy_reward/std": 0.2494617998600006, - "rewards/format_reward/mean": 0.794921875, - "rewards/format_reward/std": 0.4041535556316376, - "rewards/tag_count_reward/mean": 0.8876953125, - "rewards/tag_count_reward/std": 0.23530170321464539, + "grad_norm": 2.277294635772705, + "kl": 3.9765625, + "learning_rate": 7.209680643375978e-07, + "loss": 0.189, + "num_tokens": 807488439.0, + "reward": 0.9296875, + "reward_std": 0.2788533866405487, + "rewards/accuracy_reward/mean": 0.060483869165182114, + "rewards/accuracy_reward/std": 0.2386218160390854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.2381935715675354, "step": 1285 }, { @@ -37280,27 +37280,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 788.748046875, - "completions/mean_terminated_length": 748.1270141601562, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/max_terminated_length": 1658.0, + "completions/mean_length": 746.6875, + "completions/mean_terminated_length": 744.140869140625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, "epoch": 0.4390202270205684, - "grad_norm": 1.1738215684890747, - "kl": 6.421875, - "learning_rate": 7.201623642494943e-07, - "loss": 0.3847, - "num_tokens": 736220559.0, - "reward": 1.77685546875, - "reward_std": 0.5522069931030273, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.8125, - "rewards/format_reward/std": 0.39069411158561707, - "rewards/tag_count_reward/mean": 0.91162109375, - "rewards/tag_count_reward/std": 0.2036283165216446, + "grad_norm": 4.166876316070557, + "kl": 4.5859375, + "learning_rate": 7.204718466569645e-07, + "loss": 0.1855, + "num_tokens": 807949959.0, + "reward": 0.90478515625, + "reward_std": 0.283225953578949, + "rewards/accuracy_reward/mean": 0.037109375, + "rewards/accuracy_reward/std": 0.18921469151973724, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.86572265625, + "rewards/tag_count_reward/std": 0.24363093078136444, "step": 1286 }, { @@ -37309,27 +37309,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1848.0, - "completions/mean_length": 762.56640625, - "completions/mean_terminated_length": 721.1007690429688, - "completions/min_length": 34.0, - "completions/min_terminated_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1724.0, + "completions/max_terminated_length": 1724.0, + "completions/mean_length": 681.1953125, + "completions/mean_terminated_length": 681.1953125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, "epoch": 0.4393616113339592, - "grad_norm": 2.3395161628723145, - "kl": 6.890625, - "learning_rate": 7.196659422012166e-07, - "loss": 0.3967, - "num_tokens": 736686993.0, - "reward": 1.77490234375, - "reward_std": 0.5984107851982117, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.20386749505996704, + "grad_norm": 2.166759490966797, + "kl": 4.859375, + "learning_rate": 7.199753868391138e-07, + "loss": 0.2633, + "num_tokens": 808374731.0, + "reward": 0.9580078125, + "reward_std": 0.2841224670410156, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8935546875, + "rewards/tag_count_reward/std": 0.2170507311820984, "step": 1287 }, { @@ -37338,27 +37338,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1885.0, - "completions/mean_length": 684.34765625, - "completions/mean_terminated_length": 659.9483032226562, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 663.521484375, + "completions/mean_terminated_length": 660.8121337890625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, "epoch": 0.43970299564735, - "grad_norm": 2.5489094257354736, - "kl": 5.8671875, - "learning_rate": 7.191692793431663e-07, - "loss": 0.3417, - "num_tokens": 737116659.0, - "reward": 1.865234375, - "reward_std": 0.4733661413192749, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.180202916264534, + "grad_norm": 1.9397300481796265, + "kl": 4.40625, + "learning_rate": 7.194786855892135e-07, + "loss": 0.2377, + "num_tokens": 808793734.0, + "reward": 0.953125, + "reward_std": 0.2842934727668762, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.888671875, + "rewards/tag_count_reward/std": 0.22515182197093964, "step": 1288 }, { @@ -37367,27 +37367,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 777.376953125, - "completions/mean_terminated_length": 723.0325927734375, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1617.0, + "completions/max_terminated_length": 1617.0, + "completions/mean_length": 742.556640625, + "completions/mean_terminated_length": 742.556640625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, "epoch": 0.4400443799607408, - "grad_norm": 1.7061121463775635, - "kl": 7.9921875, - "learning_rate": 7.186723763802654e-07, - "loss": 0.5473, - "num_tokens": 737590532.0, - "reward": 1.763671875, - "reward_std": 0.5881055593490601, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.8125, - "rewards/format_reward/std": 0.39069411158561707, - "rewards/tag_count_reward/mean": 0.908203125, - "rewards/tag_count_reward/std": 0.21127952635288239, + "grad_norm": 2.5763185024261475, + "kl": 3.2265625, + "learning_rate": 7.18981743612776e-07, + "loss": 0.1577, + "num_tokens": 809249779.0, + "reward": 0.951171875, + "reward_std": 0.2588649094104767, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.900390625, + "rewards/tag_count_reward/std": 0.21351096034049988, "step": 1289 }, { @@ -37396,27 +37396,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1968.0, - "completions/mean_length": 720.234375, - "completions/mean_terminated_length": 685.643310546875, - "completions/min_length": 46.0, - "completions/min_terminated_length": 46.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 717.91796875, + "completions/mean_terminated_length": 712.7020263671875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, "epoch": 0.4403857642741316, - "grad_norm": 1.4125784635543823, - "kl": 6.5546875, - "learning_rate": 7.181752340177769e-07, - "loss": 0.396, - "num_tokens": 738035068.0, - "reward": 1.77880859375, - "reward_std": 0.5477635860443115, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.1869737207889557, + "grad_norm": 3.292499542236328, + "kl": 3.9296875, + "learning_rate": 7.184845616156543e-07, + "loss": 0.2346, + "num_tokens": 809693129.0, + "reward": 0.94580078125, + "reward_std": 0.2573530673980713, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89892578125, + "rewards/tag_count_reward/std": 0.21368040144443512, "step": 1290 }, { @@ -37425,27 +37425,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 764.392578125, - "completions/mean_terminated_length": 725.65185546875, - "completions/min_length": 18.0, - "completions/min_terminated_length": 18.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 749.140625, + "completions/mean_terminated_length": 746.5988159179688, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.4407271485875224, - "grad_norm": 1.820510745048523, - "kl": 5.8515625, - "learning_rate": 7.176778529613035e-07, - "loss": 0.373, - "num_tokens": 738512837.0, - "reward": 1.7880859375, - "reward_std": 0.5815557241439819, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.798828125, - "rewards/format_reward/std": 0.4012683033943176, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.19077126681804657, + "grad_norm": 2.654452085494995, + "kl": 3.4453125, + "learning_rate": 7.17987140304043e-07, + "loss": 0.203, + "num_tokens": 810163089.0, + "reward": 0.9482421875, + "reward_std": 0.2943015694618225, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8798828125, + "rewards/tag_count_reward/std": 0.22659650444984436, "step": 1291 }, { @@ -37454,27 +37454,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1970.0, - "completions/mean_length": 819.72265625, - "completions/mean_terminated_length": 761.950927734375, - "completions/min_length": 192.0, - "completions/min_terminated_length": 192.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1661.0, + "completions/max_terminated_length": 1661.0, + "completions/mean_length": 714.982421875, + "completions/mean_terminated_length": 714.982421875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.4410685329009132, - "grad_norm": 3.221315383911133, - "kl": 7.8125, - "learning_rate": 7.171802339167864e-07, - "loss": 0.4631, - "num_tokens": 739009815.0, - "reward": 1.81591796875, - "reward_std": 0.6045268177986145, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.90380859375, - "rewards/tag_count_reward/std": 0.21649256348609924, + "grad_norm": 2.422903299331665, + "kl": 3.4296875, + "learning_rate": 7.174894803844765e-07, + "loss": 0.1733, + "num_tokens": 810606440.0, + "reward": 1.04541015625, + "reward_std": 0.3446163535118103, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.88525390625, + "rewards/tag_count_reward/std": 0.22479026019573212, "step": 1292 }, { @@ -37483,27 +37483,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 836.876953125, - "completions/mean_terminated_length": 774.704345703125, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 1592.0, + "completions/mean_length": 747.083984375, + "completions/mean_terminated_length": 739.41650390625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.441409917214304, - "grad_norm": 1.7177226543426514, - "kl": 7.7734375, - "learning_rate": 7.16682377590505e-07, - "loss": 0.4845, - "num_tokens": 739515048.0, - "reward": 1.744140625, - "reward_std": 0.6319704055786133, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.779296875, - "rewards/format_reward/std": 0.4151262938976288, - "rewards/tag_count_reward/mean": 0.8984375, - "rewards/tag_count_reward/std": 0.21993927657604218, + "grad_norm": 3.93560528755188, + "kl": 4.21484375, + "learning_rate": 7.169915825638277e-07, + "loss": 0.2202, + "num_tokens": 811065699.0, + "reward": 0.90625, + "reward_std": 0.2945861220359802, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.849609375, + "rewards/tag_count_reward/std": 0.24747224152088165, "step": 1293 }, { @@ -37511,28 +37511,28 @@ "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, - "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1799.0, - "completions/mean_length": 725.666015625, - "completions/mean_terminated_length": 699.32470703125, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1797.0, + "completions/max_terminated_length": 1797.0, + "completions/mean_length": 689.666015625, + "completions/mean_terminated_length": 689.666015625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, "epoch": 0.4417513015276948, - "grad_norm": 1.3130954504013062, - "kl": 5.3203125, - "learning_rate": 7.161842846890751e-07, - "loss": 0.314, - "num_tokens": 739961325.0, - "reward": 1.79931640625, - "reward_std": 0.5158872604370117, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.18322066962718964, + "grad_norm": 3.5508759021759033, + "kl": 3.49609375, + "learning_rate": 7.164934475493081e-07, + "loss": 0.1786, + "num_tokens": 811493544.0, + "reward": 0.9189453125, + "reward_std": 0.27675721049308777, + "rewards/accuracy_reward/mean": 0.041015625, + "rewards/accuracy_reward/std": 0.19852031767368317, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.8759765625, + "rewards/tag_count_reward/std": 0.2340802550315857, "step": 1294 }, { @@ -37541,27 +37541,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1979.0, - "completions/mean_length": 787.35546875, - "completions/mean_terminated_length": 733.4379272460938, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 1607.0, + "completions/mean_length": 761.806640625, + "completions/mean_terminated_length": 759.2896118164062, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, "epoch": 0.4420926858410856, - "grad_norm": 1.687428593635559, - "kl": 5.6015625, - "learning_rate": 7.156859559194488e-07, - "loss": 0.357, - "num_tokens": 740440867.0, - "reward": 1.78564453125, - "reward_std": 0.5600343942642212, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.19380934536457062, + "grad_norm": 2.3482491970062256, + "kl": 3.40234375, + "learning_rate": 7.159950760484658e-07, + "loss": 0.1707, + "num_tokens": 811960005.0, + "reward": 0.974609375, + "reward_std": 0.2815355062484741, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.888671875, + "rewards/tag_count_reward/std": 0.2179754674434662, "step": 1295 }, { @@ -37570,27 +37570,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 773.626953125, - "completions/mean_terminated_length": 740.4268798828125, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1551.0, + "completions/max_terminated_length": 1551.0, + "completions/mean_length": 691.8984375, + "completions/mean_terminated_length": 691.8984375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.4424340701544764, - "grad_norm": 1.3360075950622559, - "kl": 6.4375, - "learning_rate": 7.151873919889122e-07, - "loss": 0.4097, - "num_tokens": 740915060.0, - "reward": 1.7626953125, - "reward_std": 0.5476801991462708, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.798828125, - "rewards/format_reward/std": 0.4012683033943176, - "rewards/tag_count_reward/mean": 0.9111328125, - "rewards/tag_count_reward/std": 0.20787595212459564, + "grad_norm": 2.6762218475341797, + "kl": 3.3671875, + "learning_rate": 7.154964687691844e-07, + "loss": 0.1773, + "num_tokens": 812392353.0, + "reward": 0.9609375, + "reward_std": 0.27421897649765015, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.888671875, + "rewards/tag_count_reward/std": 0.22731435298919678, "step": 1296 }, { @@ -37599,27 +37599,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1965.0, - "completions/mean_length": 745.232421875, - "completions/mean_terminated_length": 708.6083984375, - "completions/min_length": 31.0, - "completions/min_terminated_length": 31.0, + "completions/max_terminated_length": 1734.0, + "completions/mean_length": 706.103515625, + "completions/mean_terminated_length": 700.8412475585938, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, "epoch": 0.4427754544678672, - "grad_norm": 1.0148136615753174, - "kl": 6.0703125, - "learning_rate": 7.146885936050861e-07, - "loss": 0.3841, - "num_tokens": 741372347.0, - "reward": 1.78076171875, - "reward_std": 0.5385444164276123, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.19901487231254578, + "grad_norm": 3.3708600997924805, + "kl": 4.375, + "learning_rate": 7.149976264196833e-07, + "loss": 0.2532, + "num_tokens": 812829606.0, + "reward": 0.955078125, + "reward_std": 0.2880450487136841, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.892578125, + "rewards/tag_count_reward/std": 0.22488003969192505, "step": 1297 }, { @@ -37628,27 +37628,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 817.35546875, - "completions/mean_terminated_length": 780.2132568359375, - "completions/min_length": 82.0, - "completions/min_terminated_length": 82.0, + "completions/max_terminated_length": 1706.0, + "completions/mean_length": 754.771484375, + "completions/mean_terminated_length": 752.24072265625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, "epoch": 0.443116838781258, - "grad_norm": 2.377429723739624, - "kl": 5.15625, - "learning_rate": 7.141895614759232e-07, - "loss": 0.3373, - "num_tokens": 741868353.0, - "reward": 1.845703125, - "reward_std": 0.5714106559753418, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.19234009087085724, + "grad_norm": 1.8356916904449463, + "kl": 3.6796875, + "learning_rate": 7.144985497085148e-07, + "loss": 0.1997, + "num_tokens": 813293569.0, + "reward": 1.01513671875, + "reward_std": 0.3359909951686859, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89208984375, + "rewards/tag_count_reward/std": 0.21660728752613068, "step": 1298 }, { @@ -37657,27 +37657,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1875.0, - "completions/mean_length": 756.16796875, - "completions/mean_terminated_length": 703.6544189453125, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, + "completions/max_terminated_length": 1705.0, + "completions/mean_length": 717.09375, + "completions/mean_terminated_length": 714.4892578125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.4434582230946488, - "grad_norm": 2.2262306213378906, - "kl": 7.6796875, - "learning_rate": 7.136902963097085e-07, - "loss": 0.4547, - "num_tokens": 742327895.0, - "reward": 1.767578125, - "reward_std": 0.5512421131134033, - "rewards/accuracy_reward/mean": 0.038306452333927155, - "rewards/accuracy_reward/std": 0.19212885200977325, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.90625, - "rewards/tag_count_reward/std": 0.2178439050912857, + "grad_norm": 1.586843729019165, + "kl": 4.12109375, + "learning_rate": 7.13999239344565e-07, + "loss": 0.2246, + "num_tokens": 813733105.0, + "reward": 0.9404296875, + "reward_std": 0.28585097193717957, + "rewards/accuracy_reward/mean": 0.05443548411130905, + "rewards/accuracy_reward/std": 0.227104052901268, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.8857421875, + "rewards/tag_count_reward/std": 0.22312895953655243, "step": 1299 }, { @@ -37686,27 +37686,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1947.0, - "completions/mean_length": 726.76953125, - "completions/mean_terminated_length": 689.62646484375, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1713.0, + "completions/max_terminated_length": 1713.0, + "completions/mean_length": 690.529296875, + "completions/mean_terminated_length": 690.529296875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, "epoch": 0.4437996074080396, - "grad_norm": 3.62125301361084, - "kl": 9.34375, - "learning_rate": 7.131907988150575e-07, - "loss": 0.5471, - "num_tokens": 742778865.0, - "reward": 1.75634765625, - "reward_std": 0.671599805355072, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.775390625, - "rewards/format_reward/std": 0.41773295402526855, - "rewards/tag_count_reward/mean": 0.88916015625, - "rewards/tag_count_reward/std": 0.2315494418144226, + "grad_norm": 4.408387184143066, + "kl": 3.734375, + "learning_rate": 7.13499696037051e-07, + "loss": 0.2029, + "num_tokens": 814165520.0, + "reward": 1.01318359375, + "reward_std": 0.2975776195526123, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91748046875, + "rewards/tag_count_reward/std": 0.19194161891937256, "step": 1300 }, { @@ -37715,27 +37715,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 869.12890625, - "completions/mean_terminated_length": 790.5375366210938, - "completions/min_length": 203.0, - "completions/min_terminated_length": 203.0, + "completions/max_terminated_length": 1470.0, + "completions/mean_length": 758.994140625, + "completions/mean_terminated_length": 753.9392700195312, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.4441409917214304, - "grad_norm": 3.7968902587890625, - "kl": 9.2890625, - "learning_rate": 7.126910697009156e-07, - "loss": 0.5645, - "num_tokens": 743310179.0, - "reward": 1.72314453125, - "reward_std": 0.6497185230255127, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.751953125, - "rewards/format_reward/std": 0.4323015511035919, - "rewards/tag_count_reward/mean": 0.88525390625, - "rewards/tag_count_reward/std": 0.23542095720767975, + "grad_norm": 2.2561194896698, + "kl": 3.73828125, + "learning_rate": 7.129999204955214e-07, + "loss": 0.1972, + "num_tokens": 814640445.0, + "reward": 0.978515625, + "reward_std": 0.2816011905670166, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.21211905777454376, "step": 1301 }, { @@ -37744,27 +37744,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 753.142578125, - "completions/mean_terminated_length": 724.7125854492188, - "completions/min_length": 78.0, - "completions/min_terminated_length": 78.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1800.0, + "completions/max_terminated_length": 1800.0, + "completions/mean_length": 723.421875, + "completions/mean_terminated_length": 723.421875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.4444823760348212, - "grad_norm": 1.2830666303634644, - "kl": 6.3125, - "learning_rate": 7.121911096765571e-07, - "loss": 0.3837, - "num_tokens": 743777484.0, - "reward": 1.8046875, - "reward_std": 0.5349550247192383, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.19234009087085724, + "grad_norm": 2.8170838356018066, + "kl": 3.8671875, + "learning_rate": 7.124999134298544e-07, + "loss": 0.2023, + "num_tokens": 815092533.0, + "reward": 0.97265625, + "reward_std": 0.27949732542037964, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.21201092004776, "step": 1302 }, { @@ -37773,27 +37773,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1978.0, - "completions/mean_length": 770.044921875, - "completions/mean_terminated_length": 723.4797973632812, - "completions/min_length": 219.0, - "completions/min_terminated_length": 219.0, + "completions/max_terminated_length": 1710.0, + "completions/mean_length": 738.4921875, + "completions/mean_terminated_length": 735.9295654296875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, "epoch": 0.444823760348212, - "grad_norm": 1.2163110971450806, - "kl": 6.5078125, - "learning_rate": 7.116909194515831e-07, - "loss": 0.4312, - "num_tokens": 744247491.0, - "reward": 1.83203125, - "reward_std": 0.5182892680168152, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.18946701288223267, + "grad_norm": 2.1645700931549072, + "kl": 3.5546875, + "learning_rate": 7.119996755502572e-07, + "loss": 0.1779, + "num_tokens": 815546385.0, + "reward": 0.9375, + "reward_std": 0.2515963912010193, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.1939331740140915, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.896484375, + "rewards/tag_count_reward/std": 0.2184658795595169, "step": 1303 }, { @@ -37802,27 +37802,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 757.763671875, - "completions/mean_terminated_length": 710.7510375976562, - "completions/min_length": 218.0, - "completions/min_terminated_length": 218.0, + "completions/max_terminated_length": 1701.0, + "completions/mean_length": 714.59375, + "completions/mean_terminated_length": 709.36474609375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.4451651446616028, - "grad_norm": 1.347507357597351, - "kl": 7.1015625, - "learning_rate": 7.111904997359229e-07, - "loss": 0.4555, - "num_tokens": 744711338.0, - "reward": 1.80615234375, - "reward_std": 0.543106198310852, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.20237624645233154, + "grad_norm": 2.33807373046875, + "kl": 3.1875, + "learning_rate": 7.114992075672648e-07, + "loss": 0.1557, + "num_tokens": 815988129.0, + "reward": 1.02392578125, + "reward_std": 0.286119282245636, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92236328125, + "rewards/tag_count_reward/std": 0.19833672046661377, "step": 1304 }, { @@ -37831,27 +37831,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 790.521484375, - "completions/mean_terminated_length": 736.7393188476562, - "completions/min_length": 70.0, - "completions/min_terminated_length": 70.0, + "completions/max_terminated_length": 1702.0, + "completions/mean_length": 721.060546875, + "completions/mean_terminated_length": 718.4638061523438, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.4455065289749936, - "grad_norm": 1.7766551971435547, - "kl": 6.9296875, - "learning_rate": 7.106898512398305e-07, - "loss": 0.461, - "num_tokens": 745189237.0, - "reward": 1.8125, - "reward_std": 0.572502851486206, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.2069661170244217, + "grad_norm": 6.866147994995117, + "kl": 3.53515625, + "learning_rate": 7.10998510191739e-07, + "loss": 0.1699, + "num_tokens": 816430464.0, + "reward": 1.0400390625, + "reward_std": 0.32317155599594116, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.9052734375, + "rewards/tag_count_reward/std": 0.20997978746891022, "step": 1305 }, { @@ -37860,27 +37860,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 724.638671875, - "completions/mean_terminated_length": 665.222412109375, - "completions/min_length": 42.0, - "completions/min_terminated_length": 42.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1550.0, + "completions/max_terminated_length": 1550.0, + "completions/mean_length": 694.80859375, + "completions/mean_terminated_length": 694.80859375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, "epoch": 0.4458479132883844, - "grad_norm": 2.5597076416015625, - "kl": 6.390625, - "learning_rate": 7.101889746738848e-07, - "loss": 0.4223, - "num_tokens": 745635292.0, - "reward": 1.82275390625, - "reward_std": 0.5397883653640747, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.19859671592712402, + "grad_norm": 3.406233072280884, + "kl": 2.912109375, + "learning_rate": 7.104975841348673e-07, + "loss": 0.1439, + "num_tokens": 816861246.0, + "reward": 0.99462890625, + "reward_std": 0.2748178243637085, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91064453125, + "rewards/tag_count_reward/std": 0.2001686543226242, "step": 1306 }, { @@ -37889,27 +37889,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1945.0, - "completions/mean_length": 803.638671875, - "completions/mean_terminated_length": 753.0548706054688, - "completions/min_length": 13.0, - "completions/min_terminated_length": 13.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 753.3046875, + "completions/mean_terminated_length": 750.7710571289062, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.4461892976017752, - "grad_norm": 1.1304194927215576, - "kl": 8.203125, - "learning_rate": 7.096878707489885e-07, - "loss": 0.518, - "num_tokens": 746127459.0, - "reward": 1.7470703125, - "reward_std": 0.6205352544784546, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.79296875, - "rewards/format_reward/std": 0.40557438135147095, - "rewards/tag_count_reward/mean": 0.8994140625, - "rewards/tag_count_reward/std": 0.22369353473186493, + "grad_norm": 1.5010937452316284, + "kl": 3.2890625, + "learning_rate": 7.099964301081621e-07, + "loss": 0.1465, + "num_tokens": 817327642.0, + "reward": 0.9755859375, + "reward_std": 0.2628176212310791, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.19373352825641632, "step": 1307 }, { @@ -37918,27 +37918,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 788.05078125, - "completions/mean_terminated_length": 747.4072265625, - "completions/min_length": 172.0, - "completions/min_terminated_length": 172.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1758.0, + "completions/max_terminated_length": 1758.0, + "completions/mean_length": 783.052734375, + "completions/mean_terminated_length": 783.052734375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.446530681915166, - "grad_norm": 1.2309277057647705, - "kl": 6.25, - "learning_rate": 7.091865401763671e-07, - "loss": 0.381, - "num_tokens": 746606573.0, - "reward": 1.84423828125, - "reward_std": 0.5464041233062744, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.18848544359207153, + "grad_norm": 2.217822790145874, + "kl": 2.48828125, + "learning_rate": 7.0949504882346e-07, + "loss": 0.1096, + "num_tokens": 817804197.0, + "reward": 0.998046875, + "reward_std": 0.23435276746749878, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.18268859386444092, "step": 1308 }, { @@ -37947,27 +37947,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 772.6796875, - "completions/mean_terminated_length": 715.4203491210938, - "completions/min_length": 13.0, - "completions/min_terminated_length": 13.0, + "completions/max_terminated_length": 1688.0, + "completions/mean_length": 726.181640625, + "completions/mean_terminated_length": 723.5949096679688, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, "epoch": 0.4468720662285568, - "grad_norm": 2.441206693649292, - "kl": 8.65625, - "learning_rate": 7.086849836675675e-07, - "loss": 0.5249, - "num_tokens": 747076921.0, - "reward": 1.76611328125, - "reward_std": 0.6479424238204956, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.78125, - "rewards/format_reward/std": 0.41380295157432556, - "rewards/tag_count_reward/mean": 0.89306640625, - "rewards/tag_count_reward/std": 0.22429661452770233, + "grad_norm": 3.846050262451172, + "kl": 2.796875, + "learning_rate": 7.089934409929198e-07, + "loss": 0.1467, + "num_tokens": 818250738.0, + "reward": 1.04052734375, + "reward_std": 0.26566195487976074, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92529296875, + "rewards/tag_count_reward/std": 0.18282388150691986, "step": 1309 }, { @@ -37976,27 +37976,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1767.0, - "completions/mean_length": 767.103515625, - "completions/mean_terminated_length": 706.8568115234375, - "completions/min_length": 79.0, - "completions/min_terminated_length": 79.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1909.0, + "completions/max_terminated_length": 1909.0, + "completions/mean_length": 701.01953125, + "completions/mean_terminated_length": 701.01953125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, "epoch": 0.4472134505419476, - "grad_norm": 2.090247392654419, - "kl": 7.59375, - "learning_rate": 7.081832019344573e-07, - "loss": 0.4598, - "num_tokens": 747558638.0, - "reward": 1.74658203125, - "reward_std": 0.5532182455062866, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.80078125, - "rewards/format_reward/std": 0.39980348944664, - "rewards/tag_count_reward/mean": 0.90283203125, - "rewards/tag_count_reward/std": 0.21491996943950653, + "grad_norm": 3.995776653289795, + "kl": 2.8359375, + "learning_rate": 7.084916073290223e-07, + "loss": 0.1289, + "num_tokens": 818698620.0, + "reward": 0.94921875, + "reward_std": 0.21329271793365479, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.1939331740140915, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.20508308708667755, "step": 1310 }, { @@ -38005,27 +38005,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.064453125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 916.376953125, - "completions/mean_terminated_length": 838.4154663085938, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 1960.0, + "completions/mean_length": 839.27734375, + "completions/mean_terminated_length": 836.9119262695312, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.4475548348553384, - "grad_norm": 2.8132505416870117, - "kl": 8.328125, - "learning_rate": 7.076811956892241e-07, - "loss": 0.4943, - "num_tokens": 748113839.0, - "reward": 1.6796875, - "reward_std": 0.6440863609313965, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.7421875, - "rewards/format_reward/std": 0.43785804510116577, - "rewards/tag_count_reward/mean": 0.87890625, - "rewards/tag_count_reward/std": 0.235612154006958, + "grad_norm": 3.408872127532959, + "kl": 3.2265625, + "learning_rate": 7.079895485445694e-07, + "loss": 0.1588, + "num_tokens": 819214346.0, + "reward": 1.0205078125, + "reward_std": 0.2695790231227875, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.18026389181613922, "step": 1311 }, { @@ -38034,27 +38034,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 786.80078125, - "completions/mean_terminated_length": 746.116943359375, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1580.0, + "completions/max_terminated_length": 1580.0, + "completions/mean_length": 736.931640625, + "completions/mean_terminated_length": 736.931640625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, "epoch": 0.4478962191687292, - "grad_norm": 1.1374046802520752, - "kl": 5.7890625, - "learning_rate": 7.07178965644374e-07, - "loss": 0.3638, - "num_tokens": 748599785.0, - "reward": 1.85693359375, - "reward_std": 0.5871137380599976, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.1861388236284256, + "grad_norm": 1.6791059970855713, + "kl": 2.33984375, + "learning_rate": 7.07487265352682e-07, + "loss": 0.1049, + "num_tokens": 819674759.0, + "reward": 1.095703125, + "reward_std": 0.2654426097869873, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.15873152017593384, "step": 1312 }, { @@ -38063,27 +38063,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 726.25, - "completions/mean_terminated_length": 683.6128540039062, - "completions/min_length": 42.0, - "completions/min_terminated_length": 42.0, + "completions/max_terminated_length": 1621.0, + "completions/mean_length": 681.40625, + "completions/mean_terminated_length": 676.047119140625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.44823760348212, - "grad_norm": 2.2175955772399902, - "kl": 5.7265625, - "learning_rate": 7.066765125127305e-07, - "loss": 0.3893, - "num_tokens": 749043593.0, - "reward": 1.81494140625, - "reward_std": 0.5199028253555298, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.1874433010816574, + "grad_norm": 2.032522201538086, + "kl": 3.36328125, + "learning_rate": 7.069847584668009e-07, + "loss": 0.1702, + "num_tokens": 820095607.0, + "reward": 0.97998046875, + "reward_std": 0.25572580099105835, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.91943359375, + "rewards/tag_count_reward/std": 0.1914980411529541, "step": 1313 }, { @@ -38092,27 +38092,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1924.0, - "completions/mean_length": 733.255859375, - "completions/mean_terminated_length": 707.0657348632812, - "completions/min_length": 49.0, - "completions/min_terminated_length": 49.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1733.0, + "completions/max_terminated_length": 1733.0, + "completions/mean_length": 704.08984375, + "completions/mean_terminated_length": 704.08984375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.4485789877955108, - "grad_norm": 3.9274680614471436, - "kl": 4.7578125, - "learning_rate": 7.061738370074342e-07, - "loss": 0.3534, - "num_tokens": 749498892.0, - "reward": 1.88720703125, - "reward_std": 0.5058292150497437, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.16893739998340607, + "grad_norm": 2.2327351570129395, + "kl": 2.8125, + "learning_rate": 7.064820286006832e-07, + "loss": 0.134, + "num_tokens": 820535973.0, + "reward": 1.04443359375, + "reward_std": 0.27658912539482117, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92529296875, + "rewards/tag_count_reward/std": 0.17807930707931519, "step": 1314 }, { @@ -38121,27 +38121,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1890.0, - "completions/mean_length": 842.671875, - "completions/mean_terminated_length": 767.6514892578125, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 757.623046875, + "completions/mean_terminated_length": 750.0177001953125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.4489203721089016, - "grad_norm": 1.1414005756378174, - "kl": 8.15625, - "learning_rate": 7.056709398419407e-07, - "loss": 0.5243, - "num_tokens": 750019828.0, - "reward": 1.69921875, - "reward_std": 0.6134747266769409, - "rewards/accuracy_reward/mean": 0.0234375, - "rewards/accuracy_reward/std": 0.15143637359142303, - "rewards/format_reward/mean": 0.78515625, - "rewards/format_reward/std": 0.4111155867576599, - "rewards/tag_count_reward/mean": 0.890625, - "rewards/tag_count_reward/std": 0.23355920612812042, + "grad_norm": 3.413795232772827, + "kl": 3.765625, + "learning_rate": 7.059790764684039e-07, + "loss": 0.1952, + "num_tokens": 821013364.0, + "reward": 0.94970703125, + "reward_std": 0.2431173026561737, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.17416280508041382, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.91259765625, + "rewards/tag_count_reward/std": 0.19919723272323608, "step": 1315 }, { @@ -38150,27 +38150,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 797.68359375, - "completions/mean_terminated_length": 744.207763671875, - "completions/min_length": 11.0, - "completions/min_terminated_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1646.0, + "completions/max_terminated_length": 1646.0, + "completions/mean_length": 732.56640625, + "completions/mean_terminated_length": 732.56640625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.4492617564222924, - "grad_norm": 1.8736921548843384, - "kl": 6.4765625, - "learning_rate": 7.051678217300207e-07, - "loss": 0.3547, - "num_tokens": 750511106.0, - "reward": 1.78125, - "reward_std": 0.60211181640625, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.80078125, - "rewards/format_reward/std": 0.39980348944664, - "rewards/tag_count_reward/mean": 0.90234375, - "rewards/tag_count_reward/std": 0.21892902255058289, + "grad_norm": 2.764852523803711, + "kl": 2.56640625, + "learning_rate": 7.054759027843532e-07, + "loss": 0.1017, + "num_tokens": 821471302.0, + "reward": 1.0546875, + "reward_std": 0.2671966254711151, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.17270830273628235, "step": 1316 }, { @@ -38179,27 +38179,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1933.0, - "completions/mean_length": 793.58203125, - "completions/mean_terminated_length": 734.5807495117188, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1634.0, + "completions/max_terminated_length": 1634.0, + "completions/mean_length": 735.04296875, + "completions/mean_terminated_length": 735.04296875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.4496031407356832, - "grad_norm": 1.5235521793365479, - "kl": 7.015625, - "learning_rate": 7.046644833857583e-07, - "loss": 0.4618, - "num_tokens": 750989820.0, - "reward": 1.791015625, - "reward_std": 0.5913950800895691, - "rewards/accuracy_reward/mean": 0.07258064299821854, - "rewards/accuracy_reward/std": 0.25970885157585144, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.904296875, - "rewards/tag_count_reward/std": 0.21642689406871796, + "grad_norm": 3.533684730529785, + "kl": 3.29296875, + "learning_rate": 7.049725082632362e-07, + "loss": 0.1847, + "num_tokens": 821920044.0, + "reward": 1.02490234375, + "reward_std": 0.2679470181465149, + "rewards/accuracy_reward/mean": 0.10685484111309052, + "rewards/accuracy_reward/std": 0.30924052000045776, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.91943359375, + "rewards/tag_count_reward/std": 0.19085827469825745, "step": 1317 }, { @@ -38208,27 +38208,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 823.259765625, - "completions/mean_terminated_length": 770.8778686523438, - "completions/min_length": 218.0, - "completions/min_terminated_length": 218.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1744.0, + "completions/max_terminated_length": 1744.0, + "completions/mean_length": 743.291015625, + "completions/mean_terminated_length": 743.291015625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, "epoch": 0.449944525049074, - "grad_norm": 1.2474850416183472, - "kl": 6.9609375, - "learning_rate": 7.041609255235503e-07, - "loss": 0.4429, - "num_tokens": 751484241.0, - "reward": 1.783203125, - "reward_std": 0.517959713935852, + "grad_norm": 1.990745186805725, + "kl": 2.34375, + "learning_rate": 7.044688936200712e-07, + "loss": 0.1033, + "num_tokens": 822373521.0, + "reward": 0.962890625, + "reward_std": 0.2034437209367752, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.2045886218547821, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.17199864983558655, "step": 1318 }, { @@ -38237,27 +38237,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 778.888671875, - "completions/mean_terminated_length": 735.3030395507812, - "completions/min_length": 66.0, - "completions/min_terminated_length": 66.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1818.0, + "completions/max_terminated_length": 1818.0, + "completions/mean_length": 746.63671875, + "completions/mean_terminated_length": 746.63671875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, "epoch": 0.4502859093624648, - "grad_norm": 1.5461757183074951, - "kl": 6.10546875, - "learning_rate": 7.036571488581049e-07, - "loss": 0.3639, - "num_tokens": 751962616.0, - "reward": 1.76123046875, - "reward_std": 0.5829464197158813, - "rewards/accuracy_reward/mean": 0.04233871027827263, - "rewards/accuracy_reward/std": 0.2015640139579773, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.21334922313690186, + "grad_norm": 2.4520978927612305, + "kl": 3.078125, + "learning_rate": 7.039650595701898e-07, + "loss": 0.1526, + "num_tokens": 822835383.0, + "reward": 0.97705078125, + "reward_std": 0.26465874910354614, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24230584502220154, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.19718192517757416, "step": 1319 }, { @@ -38266,27 +38266,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 818.220703125, - "completions/mean_terminated_length": 757.7396850585938, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1428.0, + "completions/max_terminated_length": 1428.0, + "completions/mean_length": 699.361328125, + "completions/mean_terminated_length": 699.361328125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.4506272936758556, - "grad_norm": 1.982473611831665, - "kl": 7.2734375, - "learning_rate": 7.031531541044411e-07, - "loss": 0.4438, - "num_tokens": 752449673.0, - "reward": 1.80126953125, - "reward_std": 0.5766713619232178, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.22056347131729126, + "grad_norm": 2.0763678550720215, + "kl": 2.474609375, + "learning_rate": 7.034610068292349e-07, + "loss": 0.1058, + "num_tokens": 823261584.0, + "reward": 0.98046875, + "reward_std": 0.24530547857284546, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.919921875, + "rewards/tag_count_reward/std": 0.1875101923942566, "step": 1320 }, { @@ -38294,28 +38294,28 @@ "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, - "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 798.314453125, - "completions/mean_terminated_length": 723.2816162109375, - "completions/min_length": 138.0, - "completions/min_terminated_length": 138.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1751.0, + "completions/max_terminated_length": 1751.0, + "completions/mean_length": 670.626953125, + "completions/mean_terminated_length": 670.626953125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.4509686779892464, - "grad_norm": 1.7228455543518066, - "kl": 7.1796875, - "learning_rate": 7.026489419778871e-07, - "loss": 0.5007, - "num_tokens": 752939882.0, - "reward": 1.79052734375, - "reward_std": 0.611250102519989, - "rewards/accuracy_reward/mean": 0.08669354766607285, - "rewards/accuracy_reward/std": 0.281669557094574, - "rewards/format_reward/mean": 0.8046875, - "rewards/format_reward/std": 0.3968288004398346, - "rewards/tag_count_reward/mean": 0.90185546875, - "rewards/tag_count_reward/std": 0.22504940629005432, + "grad_norm": 2.817829132080078, + "kl": 3.06640625, + "learning_rate": 7.029567361131601e-07, + "loss": 0.1736, + "num_tokens": 823686417.0, + "reward": 1.03955078125, + "reward_std": 0.28581804037094116, + "rewards/accuracy_reward/mean": 0.11693548411130905, + "rewards/accuracy_reward/std": 0.32166779041290283, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.18783541023731232, "step": 1321 }, { @@ -38324,27 +38324,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.064453125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 867.53515625, - "completions/mean_terminated_length": 786.2088012695312, - "completions/min_length": 173.0, - "completions/min_terminated_length": 173.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1890.0, + "completions/max_terminated_length": 1890.0, + "completions/mean_length": 724.439453125, + "completions/mean_terminated_length": 724.439453125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.4513100623026372, - "grad_norm": 2.166879653930664, - "kl": 6.7265625, - "learning_rate": 7.021445131940797e-07, - "loss": 0.4772, - "num_tokens": 753456572.0, - "reward": 1.77001953125, - "reward_std": 0.587213933467865, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.21208246052265167, + "grad_norm": 3.7140069007873535, + "kl": 4.0390625, + "learning_rate": 7.024522481382284e-07, + "loss": 0.2222, + "num_tokens": 824129842.0, + "reward": 0.93701171875, + "reward_std": 0.2504616379737854, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.88427734375, + "rewards/tag_count_reward/std": 0.23126865923404694, "step": 1322 }, { @@ -38353,27 +38353,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 824.8515625, - "completions/mean_terminated_length": 772.5377197265625, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/max_terminated_length": 1598.0, + "completions/mean_length": 697.712890625, + "completions/mean_terminated_length": 695.0704345703125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, "epoch": 0.451651446616028, - "grad_norm": 2.0856263637542725, - "kl": 7.90625, - "learning_rate": 7.016398684689636e-07, - "loss": 0.5179, - "num_tokens": 753964992.0, - "reward": 1.76171875, - "reward_std": 0.613147497177124, + "grad_norm": 3.1952767372131348, + "kl": 3.453125, + "learning_rate": 7.019475436210118e-07, + "loss": 0.1988, + "num_tokens": 824573167.0, + "reward": 0.98193359375, + "reward_std": 0.29698917269706726, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.78515625, - "rewards/format_reward/std": 0.4111155867576599, - "rewards/tag_count_reward/mean": 0.896484375, - "rewards/tag_count_reward/std": 0.21564844250679016, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.89599609375, + "rewards/tag_count_reward/std": 0.22129441797733307, "step": 1323 }, { @@ -38382,27 +38382,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 780.615234375, - "completions/mean_terminated_length": 734.4352416992188, - "completions/min_length": 56.0, - "completions/min_terminated_length": 56.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1451.0, + "completions/max_terminated_length": 1451.0, + "completions/mean_length": 675.330078125, + "completions/mean_terminated_length": 675.330078125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.4519928309294188, - "grad_norm": 1.2203400135040283, - "kl": 6.7265625, - "learning_rate": 7.011350085187895e-07, - "loss": 0.4204, - "num_tokens": 754435643.0, - "reward": 1.77294921875, - "reward_std": 0.6677350997924805, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.78515625, - "rewards/format_reward/std": 0.4111155867576599, - "rewards/tag_count_reward/mean": 0.89013671875, - "rewards/tag_count_reward/std": 0.23515698313713074, + "grad_norm": 2.8782715797424316, + "kl": 3.19140625, + "learning_rate": 7.014426232783896e-07, + "loss": 0.1879, + "num_tokens": 824989912.0, + "reward": 1.072265625, + "reward_std": 0.3156305253505707, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.19968174397945404, "step": 1324 }, { @@ -38411,27 +38411,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 804.119140625, - "completions/mean_terminated_length": 745.6134643554688, - "completions/min_length": 187.0, - "completions/min_terminated_length": 187.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1996.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 707.73046875, + "completions/mean_terminated_length": 707.73046875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.4523342152428096, - "grad_norm": 2.034092903137207, - "kl": 6.734375, - "learning_rate": 7.006299340601136e-07, - "loss": 0.4632, - "num_tokens": 754927608.0, - "reward": 1.78076171875, - "reward_std": 0.5564095377922058, - "rewards/accuracy_reward/mean": 0.04233871027827263, - "rewards/accuracy_reward/std": 0.2015640139579773, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.20743593573570251, + "grad_norm": 3.448002815246582, + "kl": 3.0625, + "learning_rate": 7.009374878275476e-07, + "loss": 0.1784, + "num_tokens": 825432526.0, + "reward": 0.93994140625, + "reward_std": 0.24009019136428833, + "rewards/accuracy_reward/mean": 0.032258063554763794, + "rewards/accuracy_reward/std": 0.17686307430267334, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.90478515625, + "rewards/tag_count_reward/std": 0.2088819146156311, "step": 1325 }, { @@ -38440,27 +38440,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1951.0, - "completions/mean_length": 747.501953125, - "completions/mean_terminated_length": 694.6361694335938, - "completions/min_length": 15.0, - "completions/min_terminated_length": 15.0, + "completions/max_terminated_length": 1852.0, + "completions/mean_length": 652.580078125, + "completions/mean_terminated_length": 649.8493041992188, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, "epoch": 0.4526755995562004, - "grad_norm": 1.7306252717971802, - "kl": 7.0546875, - "learning_rate": 7.001246458097972e-07, - "loss": 0.4403, - "num_tokens": 755387049.0, - "reward": 1.87548828125, - "reward_std": 0.5507863163948059, - "rewards/accuracy_reward/mean": 0.11895161122083664, - "rewards/accuracy_reward/std": 0.3240584135055542, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.20804768800735474, + "grad_norm": 5.512711524963379, + "kl": 2.2734375, + "learning_rate": 7.004321379859774e-07, + "loss": 0.1281, + "num_tokens": 825843367.0, + "reward": 1.07666015625, + "reward_std": 0.23804612457752228, + "rewards/accuracy_reward/mean": 0.14516128599643707, + "rewards/accuracy_reward/std": 0.3526190221309662, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.93408203125, + "rewards/tag_count_reward/std": 0.1704016625881195, "step": 1326 }, { @@ -38469,27 +38469,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1858.0, - "completions/mean_length": 798.201171875, - "completions/mean_terminated_length": 750.0344848632812, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1821.0, + "completions/max_terminated_length": 1821.0, + "completions/mean_length": 711.171875, + "completions/mean_terminated_length": 711.171875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, "epoch": 0.4530169838695912, - "grad_norm": 1.4104769229888916, - "kl": 6.40625, - "learning_rate": 6.99619144485004e-07, - "loss": 0.4229, - "num_tokens": 755873392.0, - "reward": 1.8349609375, - "reward_std": 0.5700967907905579, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.19929614663124084, + "grad_norm": 2.214639186859131, + "kl": 3.3671875, + "learning_rate": 6.999265744714747e-07, + "loss": 0.1561, + "num_tokens": 826285151.0, + "reward": 1.02587890625, + "reward_std": 0.26892101764678955, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.91064453125, + "rewards/tag_count_reward/std": 0.1964682638645172, "step": 1327 }, { @@ -38498,27 +38498,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1963.0, - "completions/mean_length": 790.73828125, - "completions/mean_terminated_length": 747.5596313476562, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1696.0, + "completions/max_terminated_length": 1696.0, + "completions/mean_length": 716.1796875, + "completions/mean_terminated_length": 716.1796875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, "epoch": 0.453358368182982, - "grad_norm": 3.7157509326934814, - "kl": 4.65625, - "learning_rate": 6.99113430803201e-07, - "loss": 0.3258, - "num_tokens": 756362346.0, - "reward": 1.84326171875, - "reward_std": 0.5458806753158569, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.19246363639831543, + "grad_norm": 3.9541826248168945, + "kl": 2.466796875, + "learning_rate": 6.994207980021394e-07, + "loss": 0.1332, + "num_tokens": 826735931.0, + "reward": 1.0224609375, + "reward_std": 0.25644171237945557, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.1798180788755417, "step": 1328 }, { @@ -38527,27 +38527,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1955.0, - "completions/mean_length": 839.15625, - "completions/mean_terminated_length": 807.663330078125, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/max_terminated_length": 1726.0, + "completions/mean_length": 750.740234375, + "completions/mean_terminated_length": 745.6530151367188, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, "epoch": 0.4536997524963728, - "grad_norm": 1.4480165243148804, - "kl": 6.3984375, - "learning_rate": 6.986075054821561e-07, - "loss": 0.3843, - "num_tokens": 756869130.0, - "reward": 1.783203125, - "reward_std": 0.5669708251953125, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.814453125, - "rewards/format_reward/std": 0.38912075757980347, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.19138385355472565, + "grad_norm": 42.32180404663086, + "kl": 3.5, + "learning_rate": 6.989148092963732e-07, + "loss": 0.1655, + "num_tokens": 827197446.0, + "reward": 0.96826171875, + "reward_std": 0.27005231380462646, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90185546875, + "rewards/tag_count_reward/std": 0.21390387415885925, "step": 1329 }, { @@ -38556,27 +38556,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 827.486328125, - "completions/mean_terminated_length": 754.2049560546875, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1514.0, + "completions/max_terminated_length": 1514.0, + "completions/mean_length": 705.81640625, + "completions/mean_terminated_length": 705.81640625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.4540411368097636, - "grad_norm": 2.614987373352051, - "kl": 9.59375, - "learning_rate": 6.98101369239938e-07, - "loss": 0.6153, - "num_tokens": 757356915.0, - "reward": 1.80810546875, - "reward_std": 0.5938037633895874, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.90576171875, - "rewards/tag_count_reward/std": 0.21903319656848907, + "grad_norm": 3.038391590118408, + "kl": 4.1796875, + "learning_rate": 6.984086090728795e-07, + "loss": 0.215, + "num_tokens": 827622936.0, + "reward": 0.9990234375, + "reward_std": 0.28966623544692993, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8955078125, + "rewards/tag_count_reward/std": 0.2099979966878891, "step": 1330 }, { @@ -38585,27 +38585,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 817.677734375, - "completions/mean_terminated_length": 751.8580322265625, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1693.0, + "completions/max_terminated_length": 1693.0, + "completions/mean_length": 709.224609375, + "completions/mean_terminated_length": 709.224609375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, "epoch": 0.4543825211231544, - "grad_norm": 1.9934707880020142, - "kl": 8.5703125, - "learning_rate": 6.975950227949143e-07, - "loss": 0.5526, - "num_tokens": 757857726.0, - "reward": 1.82568359375, - "reward_std": 0.619998037815094, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.8125, - "rewards/format_reward/std": 0.39069411158561707, - "rewards/tag_count_reward/mean": 0.90185546875, - "rewards/tag_count_reward/std": 0.22286489605903625, + "grad_norm": 2.636596202850342, + "kl": 3.8671875, + "learning_rate": 6.979021980506619e-07, + "loss": 0.2154, + "num_tokens": 828068219.0, + "reward": 1.017578125, + "reward_std": 0.3015235662460327, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.21154166758060455, "step": 1331 }, { @@ -38614,27 +38614,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1869.0, - "completions/mean_length": 841.33203125, - "completions/mean_terminated_length": 789.7230834960938, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/max_terminated_length": 1906.0, + "completions/mean_length": 740.01953125, + "completions/mean_terminated_length": 734.8902587890625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.4547239054365452, - "grad_norm": 1.104121446609497, - "kl": 8.2109375, - "learning_rate": 6.970884668657512e-07, - "loss": 0.5149, - "num_tokens": 758371464.0, - "reward": 1.76904296875, - "reward_std": 0.540198564529419, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.90380859375, - "rewards/tag_count_reward/std": 0.22316910326480865, + "grad_norm": 2.719158172607422, + "kl": 4.546875, + "learning_rate": 6.973955769490243e-07, + "loss": 0.248, + "num_tokens": 828530085.0, + "reward": 0.91748046875, + "reward_std": 0.25036895275115967, + "rewards/accuracy_reward/mean": 0.03515625, + "rewards/accuracy_reward/std": 0.1843547374010086, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88232421875, + "rewards/tag_count_reward/std": 0.22271907329559326, "step": 1332 }, { @@ -38643,27 +38643,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 844.0859375, - "completions/mean_terminated_length": 782.2833862304688, - "completions/min_length": 173.0, - "completions/min_terminated_length": 173.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2013.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 725.271484375, + "completions/mean_terminated_length": 725.271484375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, "epoch": 0.455065289749936, - "grad_norm": 3.675091028213501, - "kl": 8.7109375, - "learning_rate": 6.965817021714124e-07, - "loss": 0.5246, - "num_tokens": 758886868.0, - "reward": 1.77783203125, - "reward_std": 0.5871965885162354, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.90478515625, - "rewards/tag_count_reward/std": 0.21972574293613434, + "grad_norm": 2.873018980026245, + "kl": 5.046875, + "learning_rate": 6.96888746487568e-07, + "loss": 0.2991, + "num_tokens": 828984656.0, + "reward": 0.9521484375, + "reward_std": 0.30791783332824707, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.8798828125, + "rewards/tag_count_reward/std": 0.23817551136016846, "step": 1333 }, { @@ -38672,27 +38672,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 780.27734375, - "completions/mean_terminated_length": 747.25048828125, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1739.0, + "completions/max_terminated_length": 1739.0, + "completions/mean_length": 699.87890625, + "completions/mean_terminated_length": 699.87890625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, "epoch": 0.4554066740633268, - "grad_norm": 1.0809215307235718, - "kl": 5.0703125, - "learning_rate": 6.960747294311575e-07, - "loss": 0.3025, - "num_tokens": 759361826.0, - "reward": 1.8271484375, - "reward_std": 0.5094544887542725, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.1954032927751541, + "grad_norm": 4.217231750488281, + "kl": 4.0390625, + "learning_rate": 6.963817073861918e-07, + "loss": 0.2136, + "num_tokens": 829418450.0, + "reward": 0.97802734375, + "reward_std": 0.28467512130737305, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.89208984375, + "rewards/tag_count_reward/std": 0.21604189276695251, "step": 1334 }, { @@ -38701,27 +38701,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 824.82421875, - "completions/mean_terminated_length": 759.3867797851562, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/max_terminated_length": 1819.0, + "completions/mean_length": 712.607421875, + "completions/mean_terminated_length": 696.7727661132812, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, "epoch": 0.4557480583767176, - "grad_norm": 3.4591548442840576, - "kl": 6.4375, - "learning_rate": 6.955675493645415e-07, - "loss": 0.476, - "num_tokens": 759856440.0, - "reward": 1.8408203125, - "reward_std": 0.5953292846679688, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.19973675906658173, + "grad_norm": 2.996673345565796, + "kl": 4.46484375, + "learning_rate": 6.958744603650916e-07, + "loss": 0.2481, + "num_tokens": 829855609.0, + "reward": 1.0, + "reward_std": 0.2858024835586548, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.896484375, + "rewards/tag_count_reward/std": 0.2206939458847046, "step": 1335 }, { @@ -38730,27 +38730,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1973.0, - "completions/mean_length": 827.73046875, - "completions/mean_terminated_length": 788.3668823242188, - "completions/min_length": 96.0, - "completions/min_terminated_length": 96.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1769.0, + "completions/max_terminated_length": 1769.0, + "completions/mean_length": 720.28125, + "completions/mean_terminated_length": 720.28125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, "epoch": 0.45608944269010837, - "grad_norm": 3.3444979190826416, - "kl": 5.34375, - "learning_rate": 6.950601626914139e-07, - "loss": 0.3554, - "num_tokens": 760362846.0, - "reward": 1.83544921875, - "reward_std": 0.5050147771835327, - "rewards/accuracy_reward/mean": 0.058467742055654526, - "rewards/accuracy_reward/std": 0.23486268520355225, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.19710436463356018, + "grad_norm": 1.891392469406128, + "kl": 4.13671875, + "learning_rate": 6.953670061447576e-07, + "loss": 0.2217, + "num_tokens": 830307001.0, + "reward": 0.978515625, + "reward_std": 0.2753485441207886, + "rewards/accuracy_reward/mean": 0.08467742055654526, + "rewards/accuracy_reward/std": 0.278682142496109, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.896484375, + "rewards/tag_count_reward/std": 0.21336770057678223, "step": 1336 }, { @@ -38759,27 +38759,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1975.0, - "completions/mean_length": 823.904296875, - "completions/mean_terminated_length": 761.0657348632812, - "completions/min_length": 195.0, - "completions/min_terminated_length": 195.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1810.0, + "completions/max_terminated_length": 1810.0, + "completions/mean_length": 705.3671875, + "completions/mean_terminated_length": 705.3671875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, "epoch": 0.4564308270034992, - "grad_norm": 4.18440580368042, - "kl": 5.53125, - "learning_rate": 6.94552570131917e-07, - "loss": 0.4259, - "num_tokens": 760864061.0, - "reward": 1.85498046875, - "reward_std": 0.5451841354370117, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.20040719211101532, + "grad_norm": 3.3166732788085938, + "kl": 3.7265625, + "learning_rate": 6.948593454459752e-07, + "loss": 0.2075, + "num_tokens": 830747525.0, + "reward": 1.00927734375, + "reward_std": 0.30435097217559814, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90771484375, + "rewards/tag_count_reward/std": 0.2084423005580902, "step": 1337 }, { @@ -38788,27 +38788,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1980.0, - "completions/mean_length": 818.67578125, - "completions/mean_terminated_length": 781.5734252929688, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 734.3203125, + "completions/mean_terminated_length": 731.74951171875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, "epoch": 0.45677221131689, - "grad_norm": 2.1832258701324463, - "kl": 5.09375, - "learning_rate": 6.940447724064861e-07, - "loss": 0.3425, - "num_tokens": 761359943.0, - "reward": 1.9228515625, - "reward_std": 0.5376400947570801, - "rewards/accuracy_reward/mean": 0.119140625, - "rewards/accuracy_reward/std": 0.32427072525024414, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.9345703125, - "rewards/tag_count_reward/std": 0.18668025732040405, + "grad_norm": 1.527887225151062, + "kl": 3.1953125, + "learning_rate": 6.943514789898224e-07, + "loss": 0.1564, + "num_tokens": 831200217.0, + "reward": 1.02880859375, + "reward_std": 0.3165457248687744, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.91552734375, + "rewards/tag_count_reward/std": 0.20045962929725647, "step": 1338 }, { @@ -38817,27 +38817,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 800.65234375, - "completions/mean_terminated_length": 757.8141479492188, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1434.0, + "completions/max_terminated_length": 1434.0, + "completions/mean_length": 688.16015625, + "completions/mean_terminated_length": 688.16015625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.4571135956302808, - "grad_norm": 1.333736538887024, - "kl": 6.3125, - "learning_rate": 6.935367702358469e-07, - "loss": 0.398, - "num_tokens": 761838021.0, - "reward": 1.90234375, - "reward_std": 0.5752283334732056, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.2035866528749466, + "grad_norm": 2.426649570465088, + "kl": 2.5625, + "learning_rate": 6.938434074976701e-07, + "loss": 0.1023, + "num_tokens": 831620699.0, + "reward": 1.07666015625, + "reward_std": 0.3229554295539856, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91845703125, + "rewards/tag_count_reward/std": 0.18523313105106354, "step": 1339 }, { @@ -38846,27 +38846,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1938.0, - "completions/mean_length": 755.560546875, - "completions/mean_terminated_length": 694.7709350585938, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1962.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 666.642578125, + "completions/mean_terminated_length": 666.642578125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.45745497994367157, - "grad_norm": 1.5195820331573486, - "kl": 7.4921875, - "learning_rate": 6.930285643410154e-07, - "loss": 0.4774, - "num_tokens": 762306980.0, - "reward": 1.8505859375, - "reward_std": 0.5642558932304382, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.9091796875, - "rewards/tag_count_reward/std": 0.21908392012119293, + "grad_norm": 5.0506086349487305, + "kl": 4.5859375, + "learning_rate": 6.933351316911798e-07, + "loss": 0.2509, + "num_tokens": 832044132.0, + "reward": 0.98486328125, + "reward_std": 0.32877469062805176, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.88134765625, + "rewards/tag_count_reward/std": 0.23659509420394897, "step": 1340 }, { @@ -38875,27 +38875,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 766.0390625, - "completions/mean_terminated_length": 719.3279418945312, - "completions/min_length": 200.0, - "completions/min_terminated_length": 200.0, + "completions/max_terminated_length": 1730.0, + "completions/mean_length": 707.759765625, + "completions/mean_terminated_length": 702.5039672851562, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.4577963642570624, - "grad_norm": 1.7313032150268555, - "kl": 6.7265625, - "learning_rate": 6.925201554432972e-07, - "loss": 0.4252, - "num_tokens": 762779560.0, - "reward": 1.830078125, - "reward_std": 0.4659072756767273, - "rewards/accuracy_reward/mean": 0.02916666679084301, - "rewards/accuracy_reward/std": 0.1684490591287613, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.931640625, - "rewards/tag_count_reward/std": 0.19400213658809662, + "grad_norm": 2.1826882362365723, + "kl": 3.7109375, + "learning_rate": 6.928266522923035e-07, + "loss": 0.1971, + "num_tokens": 832486873.0, + "reward": 0.958984375, + "reward_std": 0.21873745322227478, + "rewards/accuracy_reward/mean": 0.04583333432674408, + "rewards/accuracy_reward/std": 0.20934167504310608, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.912109375, + "rewards/tag_count_reward/std": 0.20051266252994537, "step": 1341 }, { @@ -38904,27 +38904,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1913.0, - "completions/mean_length": 855.6640625, - "completions/mean_terminated_length": 804.6680908203125, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1947.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 769.08203125, + "completions/mean_terminated_length": 769.08203125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, "epoch": 0.4581377485704532, - "grad_norm": 0.7581866979598999, - "kl": 6.7890625, - "learning_rate": 6.920115442642858e-07, - "loss": 0.4448, - "num_tokens": 763302764.0, - "reward": 1.828125, - "reward_std": 0.583230197429657, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.20074127614498138, + "grad_norm": 1.371950626373291, + "kl": 2.921875, + "learning_rate": 6.923179700232826e-07, + "loss": 0.1335, + "num_tokens": 832965747.0, + "reward": 1.03466796875, + "reward_std": 0.2636609673500061, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.91748046875, + "rewards/tag_count_reward/std": 0.19194161891937256, "step": 1342 }, { @@ -38933,27 +38933,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1976.0, - "completions/mean_length": 834.03125, - "completions/mean_terminated_length": 782.1100463867188, - "completions/min_length": 73.0, - "completions/min_terminated_length": 73.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 754.505859375, + "completions/mean_terminated_length": 751.9745483398438, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, "epoch": 0.458479132883844, - "grad_norm": 1.2648884057998657, - "kl": 8.03125, - "learning_rate": 6.915027315258614e-07, - "loss": 0.5089, - "num_tokens": 763802876.0, - "reward": 1.81884765625, - "reward_std": 0.5947195291519165, - "rewards/accuracy_reward/mean": 0.07661290466785431, - "rewards/accuracy_reward/std": 0.2662447690963745, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.21285150945186615, + "grad_norm": 3.490316390991211, + "kl": 3.93359375, + "learning_rate": 6.918090856066463e-07, + "loss": 0.1997, + "num_tokens": 833425142.0, + "reward": 0.96728515625, + "reward_std": 0.3163343071937561, + "rewards/accuracy_reward/mean": 0.08467742055654526, + "rewards/accuracy_reward/std": 0.278682142496109, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.88134765625, + "rewards/tag_count_reward/std": 0.23607757687568665, "step": 1343 }, { @@ -38962,27 +38962,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 783.857421875, - "completions/mean_terminated_length": 718.9630737304688, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1729.0, + "completions/max_terminated_length": 1729.0, + "completions/mean_length": 724.1484375, + "completions/mean_terminated_length": 724.1484375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, "epoch": 0.45882051719723477, - "grad_norm": 1.795419692993164, - "kl": 7.671875, - "learning_rate": 6.909937179501908e-07, - "loss": 0.543, - "num_tokens": 764281123.0, - "reward": 1.830078125, - "reward_std": 0.5486209988594055, - "rewards/accuracy_reward/mean": 0.052419353276491165, - "rewards/accuracy_reward/std": 0.22309619188308716, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.20286257565021515, + "grad_norm": 2.284849166870117, + "kl": 3.15234375, + "learning_rate": 6.91299999765211e-07, + "loss": 0.1325, + "num_tokens": 833872818.0, + "reward": 0.96142578125, + "reward_std": 0.26088348031044006, + "rewards/accuracy_reward/mean": 0.060483869165182114, + "rewards/accuracy_reward/std": 0.2386218160390854, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.89892578125, + "rewards/tag_count_reward/std": 0.21708759665489197, "step": 1344 }, { @@ -38991,27 +38991,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 782.736328125, - "completions/mean_terminated_length": 741.9213256835938, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2044.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 726.76953125, + "completions/mean_terminated_length": 726.76953125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.4591619015106256, - "grad_norm": 2.3222286701202393, - "kl": 7.6171875, - "learning_rate": 6.904845042597258e-07, - "loss": 0.4465, - "num_tokens": 764752572.0, - "reward": 1.82080078125, - "reward_std": 0.5536209344863892, - "rewards/accuracy_reward/mean": 0.08064515888690948, - "rewards/accuracy_reward/std": 0.2725643217563629, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.20901454985141754, + "grad_norm": 3.049931287765503, + "kl": 4.2109375, + "learning_rate": 6.907907132220794e-07, + "loss": 0.2517, + "num_tokens": 834315612.0, + "reward": 0.9921875, + "reward_std": 0.25062692165374756, + "rewards/accuracy_reward/mean": 0.09677419066429138, + "rewards/accuracy_reward/std": 0.2959485352039337, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.21316158771514893, "step": 1345 }, { @@ -39020,27 +39020,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 854.87109375, - "completions/mean_terminated_length": 793.6221923828125, - "completions/min_length": 190.0, - "completions/min_terminated_length": 190.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2044.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 760.76171875, + "completions/mean_terminated_length": 760.76171875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, "epoch": 0.4595032858240164, - "grad_norm": 1.380971908569336, - "kl": 6.8828125, - "learning_rate": 6.899750911772019e-07, - "loss": 0.4723, - "num_tokens": 765262410.0, - "reward": 1.8125, - "reward_std": 0.48659664392471313, - "rewards/accuracy_reward/mean": 0.025390625, - "rewards/accuracy_reward/std": 0.15746226906776428, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.20005464553833008, + "grad_norm": 1.9266505241394043, + "kl": 3.79296875, + "learning_rate": 6.902812267006389e-07, + "loss": 0.1849, + "num_tokens": 834777266.0, + "reward": 0.9560546875, + "reward_std": 0.2592583894729614, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.8994140625, + "rewards/tag_count_reward/std": 0.21132247149944305, "step": 1346 }, { @@ -39049,27 +39049,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 815.16796875, - "completions/mean_terminated_length": 767.6551513671875, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, - "epoch": 0.4598446701374072, - "grad_norm": 1.618891954421997, - "kl": 6.0859375, - "learning_rate": 6.894654794256378e-07, - "loss": 0.4128, - "num_tokens": 765767392.0, - "reward": 1.84375, - "reward_std": 0.5409480333328247, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.2005603015422821, + "completions/max_terminated_length": 1826.0, + "completions/mean_length": 736.369140625, + "completions/mean_terminated_length": 728.6385498046875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.4598446701374072, + "grad_norm": 2.7474708557128906, + "kl": 3.765625, + "learning_rate": 6.897715409245615e-07, + "loss": 0.2104, + "num_tokens": 835241903.0, + "reward": 1.01123046875, + "reward_std": 0.2824721336364746, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.90576171875, + "rewards/tag_count_reward/std": 0.2145194560289383, "step": 1347 }, { @@ -39078,27 +39078,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1762.0, - "completions/mean_length": 778.259765625, - "completions/mean_terminated_length": 737.3003540039062, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/max_terminated_length": 1666.0, + "completions/mean_length": 725.9296875, + "completions/mean_terminated_length": 712.8915405273438, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.46018605445079797, - "grad_norm": 2.0402615070343018, - "kl": 5.5625, - "learning_rate": 6.889556697283344e-07, - "loss": 0.3458, - "num_tokens": 766239573.0, - "reward": 1.8759765625, - "reward_std": 0.45865148305892944, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.18138417601585388, + "grad_norm": 2.6644017696380615, + "kl": 4.31640625, + "learning_rate": 6.892616566178017e-07, + "loss": 0.2167, + "num_tokens": 835687291.0, + "reward": 0.96435546875, + "reward_std": 0.263368159532547, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.90771484375, + "rewards/tag_count_reward/std": 0.20309264957904816, "step": 1348 }, { @@ -39107,27 +39107,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1892.0, - "completions/mean_length": 784.96484375, - "completions/mean_terminated_length": 717.39501953125, - "completions/min_length": 198.0, - "completions/min_terminated_length": 198.0, + "completions/max_terminated_length": 1769.0, + "completions/mean_length": 725.02734375, + "completions/mean_terminated_length": 722.4383544921875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, "epoch": 0.4605274387641888, - "grad_norm": 1.3208638429641724, - "kl": 7.9921875, - "learning_rate": 6.88445662808873e-07, - "loss": 0.5725, - "num_tokens": 766722515.0, - "reward": 1.84619140625, - "reward_std": 0.5656688213348389, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.19371071457862854, + "grad_norm": 1.93429696559906, + "kl": 2.953125, + "learning_rate": 6.887515745045963e-07, + "loss": 0.1259, + "num_tokens": 836139545.0, + "reward": 0.98193359375, + "reward_std": 0.25790125131607056, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91943359375, + "rewards/tag_count_reward/std": 0.1952926218509674, "step": 1349 }, { @@ -39136,27 +39136,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 853.98046875, - "completions/mean_terminated_length": 777.0270385742188, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/max_terminated_length": 1943.0, + "completions/mean_length": 777.263671875, + "completions/mean_terminated_length": 774.7769165039062, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.4608688230775796, - "grad_norm": 2.3935322761535645, - "kl": 9.0, - "learning_rate": 6.879354593911154e-07, - "loss": 0.6046, - "num_tokens": 767239801.0, - "reward": 1.74853515625, - "reward_std": 0.5295427441596985, - "rewards/accuracy_reward/mean": 0.0078125, - "rewards/accuracy_reward/std": 0.08812850713729858, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.20804768800735474, + "grad_norm": 1.512467384338379, + "kl": 3.6640625, + "learning_rate": 6.882412953094629e-07, + "loss": 0.1604, + "num_tokens": 836617552.0, + "reward": 0.93017578125, + "reward_std": 0.2594614624977112, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.17416280508041382, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.89501953125, + "rewards/tag_count_reward/std": 0.21748337149620056, "step": 1350 }, { @@ -39165,27 +39165,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 818.095703125, - "completions/mean_terminated_length": 754.958984375, - "completions/min_length": 123.0, - "completions/min_terminated_length": 123.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1827.0, + "completions/max_terminated_length": 1827.0, + "completions/mean_length": 743.033203125, + "completions/mean_terminated_length": 743.033203125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.4612102073909704, - "grad_norm": 3.591526985168457, - "kl": 8.703125, - "learning_rate": 6.874250601992019e-07, - "loss": 0.527, - "num_tokens": 767733882.0, - "reward": 1.80224609375, - "reward_std": 0.5414711833000183, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.20228178799152374, + "grad_norm": 2.922914743423462, + "kl": 3.359375, + "learning_rate": 6.87730819757199e-07, + "loss": 0.1915, + "num_tokens": 837073201.0, + "reward": 1.01123046875, + "reward_std": 0.29407167434692383, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.90380859375, + "rewards/tag_count_reward/std": 0.20901910960674286, "step": 1351 }, { @@ -39194,27 +39194,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 766.748046875, - "completions/mean_terminated_length": 700.9754028320312, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 1744.0, + "completions/mean_length": 668.71875, + "completions/mean_terminated_length": 663.309814453125, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, "epoch": 0.46155159170436116, - "grad_norm": 3.2490854263305664, - "kl": 7.140625, - "learning_rate": 6.869144659575507e-07, - "loss": 0.4527, - "num_tokens": 768203945.0, - "reward": 1.876953125, - "reward_std": 0.5580621957778931, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.19265778362751007, + "grad_norm": 2.058774709701538, + "kl": 3.7734375, + "learning_rate": 6.872201485728812e-07, + "loss": 0.1964, + "num_tokens": 837493073.0, + "reward": 1.0302734375, + "reward_std": 0.3157753646373749, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9033203125, + "rewards/tag_count_reward/std": 0.21141289174556732, "step": 1352 }, { @@ -39223,27 +39223,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1835.0, - "completions/mean_length": 807.17578125, - "completions/mean_terminated_length": 761.9635620117188, - "completions/min_length": 209.0, - "completions/min_terminated_length": 209.0, + "completions/max_terminated_length": 1648.0, + "completions/mean_length": 776.578125, + "completions/mean_terminated_length": 761.5020141601562, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, "epoch": 0.461892976017752, - "grad_norm": 1.5669411420822144, - "kl": 6.7890625, - "learning_rate": 6.864036773908572e-07, - "loss": 0.4174, - "num_tokens": 768703747.0, - "reward": 1.810546875, - "reward_std": 0.5165926218032837, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.19499453902244568, + "grad_norm": 2.0698978900909424, + "kl": 3.765625, + "learning_rate": 6.867092824818639e-07, + "loss": 0.1831, + "num_tokens": 837977209.0, + "reward": 0.96533203125, + "reward_std": 0.279620885848999, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.89306640625, + "rewards/tag_count_reward/std": 0.21989093720912933, "step": 1353 }, { @@ -39252,27 +39252,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1608.0, - "completions/mean_length": 783.302734375, - "completions/mean_terminated_length": 737.2206420898438, - "completions/min_length": 23.0, - "completions/min_terminated_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1621.0, + "completions/max_terminated_length": 1621.0, + "completions/mean_length": 710.08203125, + "completions/mean_terminated_length": 710.08203125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.4622343603311428, - "grad_norm": 2.5091891288757324, - "kl": 5.4296875, - "learning_rate": 6.858926952240925e-07, - "loss": 0.3761, - "num_tokens": 769187614.0, - "reward": 1.830078125, - "reward_std": 0.5374591946601868, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.20074127614498138, + "grad_norm": 4.557255744934082, + "kl": 3.85546875, + "learning_rate": 6.861982222097785e-07, + "loss": 0.2094, + "num_tokens": 838423587.0, + "reward": 0.9482421875, + "reward_std": 0.26066210865974426, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.8876953125, + "rewards/tag_count_reward/std": 0.22466535866260529, "step": 1354 }, { @@ -39281,27 +39281,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1986.0, - "completions/mean_length": 809.958984375, - "completions/mean_terminated_length": 751.72802734375, - "completions/min_length": 180.0, - "completions/min_terminated_length": 180.0, + "completions/max_terminated_length": 1514.0, + "completions/mean_length": 707.169921875, + "completions/mean_terminated_length": 701.9118041992188, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, "epoch": 0.4625757446445336, - "grad_norm": 1.9437158107757568, - "kl": 6.796875, - "learning_rate": 6.853815201825016e-07, - "loss": 0.4782, - "num_tokens": 769676281.0, - "reward": 1.8330078125, - "reward_std": 0.5054324865341187, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.19831565022468567, + "grad_norm": 2.2567248344421387, + "kl": 3.18359375, + "learning_rate": 6.856869684825316e-07, + "loss": 0.1643, + "num_tokens": 838859626.0, + "reward": 0.9794921875, + "reward_std": 0.28309738636016846, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.8955078125, + "rewards/tag_count_reward/std": 0.21173806488513947, "step": 1355 }, { @@ -39310,27 +39310,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1930.0, - "completions/mean_length": 710.34375, - "completions/mean_terminated_length": 680.9740600585938, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1696.0, + "completions/max_terminated_length": 1696.0, + "completions/mean_length": 667.1015625, + "completions/mean_terminated_length": 667.1015625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, "epoch": 0.46291712895792436, - "grad_norm": 2.3564820289611816, - "kl": 4.17578125, - "learning_rate": 6.848701529916047e-07, - "loss": 0.2856, - "num_tokens": 770120425.0, - "reward": 1.873046875, - "reward_std": 0.41014260053634644, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.892578125, - "rewards/format_reward/std": 0.30995169281959534, - "rewards/tag_count_reward/mean": 0.94921875, - "rewards/tag_count_reward/std": 0.15829749405384064, + "grad_norm": 2.9367079734802246, + "kl": 2.80859375, + "learning_rate": 6.851755220263055e-07, + "loss": 0.1399, + "num_tokens": 839281630.0, + "reward": 0.9482421875, + "reward_std": 0.23327293992042542, + "rewards/accuracy_reward/mean": 0.037109375, + "rewards/accuracy_reward/std": 0.18921469151973724, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9072265625, + "rewards/tag_count_reward/std": 0.20556476712226868, "step": 1356 }, { @@ -39339,27 +39339,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1919.0, - "completions/mean_length": 736.501953125, - "completions/mean_terminated_length": 713.0357666015625, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 1647.0, + "completions/mean_length": 689.73828125, + "completions/mean_terminated_length": 684.4118041992188, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, "epoch": 0.4632585132713152, - "grad_norm": 2.157853841781616, - "kl": 4.48828125, - "learning_rate": 6.843585943771935e-07, - "loss": 0.3001, - "num_tokens": 770570266.0, - "reward": 1.859375, - "reward_std": 0.4438847303390503, - "rewards/accuracy_reward/mean": 0.025390625, - "rewards/accuracy_reward/std": 0.15746226906776428, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.9453125, - "rewards/tag_count_reward/std": 0.1631019562482834, + "grad_norm": 4.065873146057129, + "kl": 3.13671875, + "learning_rate": 6.846638835675554e-07, + "loss": 0.1874, + "num_tokens": 839707528.0, + "reward": 0.9521484375, + "reward_std": 0.24419720470905304, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9052734375, + "rewards/tag_count_reward/std": 0.21229693293571472, "step": 1357 }, { @@ -39368,27 +39368,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1932.0, - "completions/mean_length": 797.623046875, - "completions/mean_terminated_length": 749.43408203125, - "completions/min_length": 94.0, - "completions/min_terminated_length": 94.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1673.0, + "completions/max_terminated_length": 1673.0, + "completions/mean_length": 686.5703125, + "completions/mean_terminated_length": 686.5703125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.463599897584706, - "grad_norm": 1.9284049272537231, - "kl": 6.4765625, - "learning_rate": 6.838468450653322e-07, - "loss": 0.3799, - "num_tokens": 771063209.0, - "reward": 1.830078125, - "reward_std": 0.597848653793335, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, + "grad_norm": 3.9942829608917236, + "kl": 2.625, + "learning_rate": 6.841520538330096e-07, + "loss": 0.1123, + "num_tokens": 840143612.0, + "reward": 1.02734375, + "reward_std": 0.27238187193870544, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.91015625, - "rewards/tag_count_reward/std": 0.20388682186603546, + "rewards/tag_count_reward/std": 0.18633443117141724, "step": 1358 }, { @@ -39397,27 +39397,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 721.576171875, - "completions/mean_terminated_length": 676.0222778320312, - "completions/min_length": 169.0, - "completions/min_terminated_length": 169.0, + "completions/max_terminated_length": 1725.0, + "completions/mean_length": 666.833984375, + "completions/mean_terminated_length": 664.131103515625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, "epoch": 0.4639412818980968, - "grad_norm": 2.0085532665252686, - "kl": 6.125, - "learning_rate": 6.833349057823553e-07, - "loss": 0.3542, - "num_tokens": 771518112.0, - "reward": 1.75, - "reward_std": 0.5615205764770508, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.908203125, - "rewards/tag_count_reward/std": 0.2095356285572052, + "grad_norm": 4.302802085876465, + "kl": 2.7890625, + "learning_rate": 6.836400335496682e-07, + "loss": 0.158, + "num_tokens": 840570487.0, + "reward": 0.95068359375, + "reward_std": 0.2543538510799408, + "rewards/accuracy_reward/mean": 0.041015625, + "rewards/accuracy_reward/std": 0.19852031767368317, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.90771484375, + "rewards/tag_count_reward/std": 0.20248952507972717, "step": 1359 }, { @@ -39426,27 +39426,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 841.880859375, - "completions/mean_terminated_length": 766.8112182617188, - "completions/min_length": 91.0, - "completions/min_terminated_length": 91.0, + "completions/max_terminated_length": 1938.0, + "completions/mean_length": 736.978515625, + "completions/mean_terminated_length": 734.4129028320312, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.46428266621148756, - "grad_norm": 1.0249884128570557, - "kl": 6.2421875, - "learning_rate": 6.828227772548669e-07, - "loss": 0.4018, - "num_tokens": 772050259.0, - "reward": 1.7568359375, - "reward_std": 0.5212250351905823, - "rewards/accuracy_reward/mean": 0.025390625, - "rewards/accuracy_reward/std": 0.15746226906776428, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.9130859375, - "rewards/tag_count_reward/std": 0.20634421706199646, + "grad_norm": 3.2396957874298096, + "kl": 3.73828125, + "learning_rate": 6.831278234448019e-07, + "loss": 0.1621, + "num_tokens": 841048924.0, + "reward": 0.9365234375, + "reward_std": 0.256226509809494, + "rewards/accuracy_reward/mean": 0.029296875, + "rewards/accuracy_reward/std": 0.16880230605602264, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.8994140625, + "rewards/tag_count_reward/std": 0.2089945375919342, "step": 1360 }, { @@ -39455,27 +39455,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1936.0, - "completions/mean_length": 770.390625, - "completions/mean_terminated_length": 723.8380737304688, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1604.0, + "completions/max_terminated_length": 1604.0, + "completions/mean_length": 697.76171875, + "completions/mean_terminated_length": 697.76171875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, "epoch": 0.4646240505248784, - "grad_norm": 2.542614221572876, - "kl": 6.12109375, - "learning_rate": 6.823104602097398e-07, - "loss": 0.4169, - "num_tokens": 772525627.0, - "reward": 1.84619140625, - "reward_std": 0.578186571598053, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.18913322687149048, + "grad_norm": 2.220255136489868, + "kl": 3.724609375, + "learning_rate": 6.826154242459507e-07, + "loss": 0.1693, + "num_tokens": 841487106.0, + "reward": 0.98876953125, + "reward_std": 0.32350149750709534, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.88916015625, + "rewards/tag_count_reward/std": 0.2173822969198227, "step": 1361 }, { @@ -39484,27 +39484,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 820.62890625, - "completions/mean_terminated_length": 749.6239624023438, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/max_terminated_length": 1790.0, + "completions/mean_length": 768.109375, + "completions/mean_terminated_length": 729.4808349609375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.4649654348382692, - "grad_norm": 1.1544690132141113, - "kl": 7.65625, - "learning_rate": 6.817979553741143e-07, - "loss": 0.5048, - "num_tokens": 773029709.0, - "reward": 1.7685546875, - "reward_std": 0.5714544057846069, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.9111328125, - "rewards/tag_count_reward/std": 0.21310555934906006, + "grad_norm": 2.7237648963928223, + "kl": 3.77734375, + "learning_rate": 6.821028366809238e-07, + "loss": 0.1787, + "num_tokens": 841964298.0, + "reward": 0.93896484375, + "reward_std": 0.2659711241722107, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.88427734375, + "rewards/tag_count_reward/std": 0.22043779492378235, "step": 1362 }, { @@ -39513,27 +39513,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1988.0, - "completions/mean_length": 771.203125, - "completions/mean_terminated_length": 732.66796875, - "completions/min_length": 93.0, - "completions/min_terminated_length": 93.0, + "completions/max_terminated_length": 1653.0, + "completions/mean_length": 720.58984375, + "completions/mean_terminated_length": 717.9921875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, "epoch": 0.46530681915166, - "grad_norm": 0.9877282977104187, - "kl": 6.2109375, - "learning_rate": 6.812852634753974e-07, - "loss": 0.3745, - "num_tokens": 773505909.0, - "reward": 1.84130859375, - "reward_std": 0.5676465034484863, - "rewards/accuracy_reward/mean": 0.0927419364452362, - "rewards/accuracy_reward/std": 0.2903633117675781, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.19294461607933044, + "grad_norm": 2.8945419788360596, + "kl": 3.1328125, + "learning_rate": 6.815900614777972e-07, + "loss": 0.1254, + "num_tokens": 842414584.0, + "reward": 0.99560546875, + "reward_std": 0.27973347902297974, + "rewards/accuracy_reward/mean": 0.08870967477560043, + "rewards/accuracy_reward/std": 0.2846112847328186, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.90576171875, + "rewards/tag_count_reward/std": 0.20036904513835907, "step": 1363 }, { @@ -39542,27 +39542,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 807.451171875, - "completions/mean_terminated_length": 754.3931274414062, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 1752.0, + "completions/mean_length": 746.076171875, + "completions/mean_terminated_length": 743.5283813476562, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.46564820346505076, - "grad_norm": 2.3786368370056152, - "kl": 7.734375, - "learning_rate": 6.807723852412613e-07, - "loss": 0.4793, - "num_tokens": 773999612.0, - "reward": 1.71826171875, - "reward_std": 0.5370117425918579, - "rewards/accuracy_reward/mean": 0.009765625, - "rewards/accuracy_reward/std": 0.09843364357948303, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.90576171875, - "rewards/tag_count_reward/std": 0.20874005556106567, + "grad_norm": 3.7866604328155518, + "kl": 2.83203125, + "learning_rate": 6.81077099364914e-07, + "loss": 0.1399, + "num_tokens": 842876863.0, + "reward": 0.95068359375, + "reward_std": 0.2132580578327179, + "rewards/accuracy_reward/mean": 0.0234375, + "rewards/accuracy_reward/std": 0.15143637359142303, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.18584084510803223, "step": 1364 }, { @@ -39571,27 +39571,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 836.08203125, - "completions/mean_terminated_length": 794.4606323242188, - "completions/min_length": 186.0, - "completions/min_terminated_length": 186.0, + "completions/max_terminated_length": 1867.0, + "completions/mean_length": 774.990234375, + "completions/mean_terminated_length": 769.9981079101562, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, "epoch": 0.46598958777844157, - "grad_norm": 1.763398289680481, - "kl": 6.75, - "learning_rate": 6.802593213996431e-07, - "loss": 0.4171, - "num_tokens": 774510582.0, - "reward": 1.79736328125, - "reward_std": 0.5802618265151978, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.20427954196929932, + "grad_norm": 2.5615627765655518, + "kl": 3.48046875, + "learning_rate": 6.805639510708826e-07, + "loss": 0.1689, + "num_tokens": 843356554.0, + "reward": 0.9638671875, + "reward_std": 0.2721315026283264, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.8974609375, + "rewards/tag_count_reward/std": 0.2178066223859787, "step": 1365 }, { @@ -39600,27 +39600,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1915.0, - "completions/mean_length": 784.43359375, - "completions/mean_terminated_length": 738.3927612304688, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/max_terminated_length": 1888.0, + "completions/mean_length": 760.59765625, + "completions/mean_terminated_length": 755.549072265625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.4663309720918324, - "grad_norm": 15.666436195373535, - "kl": 5.6484375, - "learning_rate": 6.797460726787427e-07, - "loss": 0.3623, - "num_tokens": 774986852.0, - "reward": 1.81982421875, - "reward_std": 0.5481522679328918, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.19323164224624634, + "grad_norm": 3.0792670249938965, + "kl": 2.7578125, + "learning_rate": 6.80050617324576e-07, + "loss": 0.1009, + "num_tokens": 843820620.0, + "reward": 0.99609375, + "reward_std": 0.2847621440887451, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.20392431318759918, "step": 1366 }, { @@ -39629,27 +39629,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 780.939453125, - "completions/mean_terminated_length": 732.1074829101562, - "completions/min_length": 120.0, - "completions/min_terminated_length": 120.0, + "completions/max_terminated_length": 1677.0, + "completions/mean_length": 714.890625, + "completions/mean_terminated_length": 707.033447265625, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, "epoch": 0.4666723564052232, - "grad_norm": 1.3910210132598877, - "kl": 5.87890625, - "learning_rate": 6.792326398070233e-07, - "loss": 0.3933, - "num_tokens": 775457109.0, - "reward": 1.8701171875, - "reward_std": 0.5559483170509338, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.19536417722702026, + "grad_norm": 3.1971635818481445, + "kl": 2.451171875, + "learning_rate": 6.795370988551301e-07, + "loss": 0.1113, + "num_tokens": 844257060.0, + "reward": 1.0625, + "reward_std": 0.2795104384422302, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.18388766050338745, "step": 1367 }, { @@ -39658,27 +39658,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 794.640625, - "completions/mean_terminated_length": 761.9879760742188, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 1576.0, + "completions/mean_length": 749.931640625, + "completions/mean_terminated_length": 747.391357421875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.46701374071861396, - "grad_norm": 3.575126886367798, - "kl": 5.0859375, - "learning_rate": 6.787190235132085e-07, - "loss": 0.3598, - "num_tokens": 775936445.0, - "reward": 1.81201171875, - "reward_std": 0.47381865978240967, + "grad_norm": 5.762070178985596, + "kl": 2.91796875, + "learning_rate": 6.790233963919437e-07, + "loss": 0.1144, + "num_tokens": 844713505.0, + "reward": 0.9443359375, + "reward_std": 0.2445402294397354, "rewards/accuracy_reward/mean": 0.024193547666072845, "rewards/accuracy_reward/std": 0.15380479395389557, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.18643119931221008, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9111328125, + "rewards/tag_count_reward/std": 0.20311442017555237, "step": 1368 }, { @@ -39687,27 +39687,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 764.12890625, - "completions/mean_terminated_length": 728.0361328125, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1728.0, + "completions/max_terminated_length": 1728.0, + "completions/mean_length": 745.609375, + "completions/mean_terminated_length": 745.609375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.46735512503200477, - "grad_norm": 2.987510919570923, - "kl": 5.796875, - "learning_rate": 6.782052245262829e-07, - "loss": 0.3871, - "num_tokens": 776405071.0, - "reward": 1.80419921875, - "reward_std": 0.5042265057563782, - "rewards/accuracy_reward/mean": 0.030241934582591057, - "rewards/accuracy_reward/std": 0.1714252382516861, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.18635430932044983, + "grad_norm": 5.123167514801025, + "kl": 2.2265625, + "learning_rate": 6.78509510664677e-07, + "loss": 0.1101, + "num_tokens": 845172649.0, + "reward": 0.978515625, + "reward_std": 0.2383328080177307, + "rewards/accuracy_reward/mean": 0.052419353276491165, + "rewards/accuracy_reward/std": 0.22309619188308716, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.18796826899051666, "step": 1369 }, { @@ -39716,27 +39716,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 796.1015625, - "completions/mean_terminated_length": 745.2113647460938, - "completions/min_length": 198.0, - "completions/min_terminated_length": 198.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1742.0, + "completions/max_terminated_length": 1742.0, + "completions/mean_length": 777.22265625, + "completions/mean_terminated_length": 777.22265625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.4676965093453956, - "grad_norm": 1.3319543600082397, - "kl": 6.6328125, - "learning_rate": 6.7769124357549e-07, - "loss": 0.4311, - "num_tokens": 776897299.0, - "reward": 1.83935546875, - "reward_std": 0.5210530757904053, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.19386357069015503, + "grad_norm": 3.2751119136810303, + "kl": 2.66796875, + "learning_rate": 6.779954424032501e-07, + "loss": 0.1207, + "num_tokens": 845655211.0, + "reward": 0.9990234375, + "reward_std": 0.2641468644142151, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.1879250556230545, "step": 1370 }, { @@ -39745,27 +39745,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 717.728515625, - "completions/mean_terminated_length": 672.04248046875, - "completions/min_length": 92.0, - "completions/min_terminated_length": 92.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1880.0, + "completions/max_terminated_length": 1880.0, + "completions/mean_length": 670.2734375, + "completions/mean_terminated_length": 670.2734375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, "epoch": 0.4680378936587864, - "grad_norm": 3.551114320755005, - "kl": 7.1953125, - "learning_rate": 6.77177081390332e-07, - "loss": 0.4403, - "num_tokens": 777334968.0, - "reward": 1.818359375, - "reward_std": 0.5296976566314697, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.19642995297908783, + "grad_norm": 2.82075834274292, + "kl": 3.09765625, + "learning_rate": 6.774811923378424e-07, + "loss": 0.1551, + "num_tokens": 846068583.0, + "reward": 1.04931640625, + "reward_std": 0.2785525918006897, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.17600135505199432, "step": 1371 }, { @@ -39774,27 +39774,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1938.0, - "completions/mean_length": 783.712890625, - "completions/mean_terminated_length": 750.7755737304688, - "completions/min_length": 175.0, - "completions/min_terminated_length": 175.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1869.0, + "completions/max_terminated_length": 1869.0, + "completions/mean_length": 743.431640625, + "completions/mean_terminated_length": 743.431640625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, "epoch": 0.46837927797217715, - "grad_norm": 1.4473611116409302, - "kl": 6.6015625, - "learning_rate": 6.76662738700568e-07, - "loss": 0.4135, - "num_tokens": 777804229.0, - "reward": 1.90625, - "reward_std": 0.5176564455032349, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.18438583612442017, + "grad_norm": 3.61566162109375, + "kl": 3.6015625, + "learning_rate": 6.769667611988922e-07, + "loss": 0.1543, + "num_tokens": 846517220.0, + "reward": 0.990234375, + "reward_std": 0.26061537861824036, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.904296875, + "rewards/tag_count_reward/std": 0.20361481606960297, "step": 1372 }, { @@ -39803,27 +39803,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 785.662109375, - "completions/mean_terminated_length": 726.288330078125, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1877.0, + "completions/max_terminated_length": 1877.0, + "completions/mean_length": 745.294921875, + "completions/mean_terminated_length": 745.294921875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, "epoch": 0.46872066228556797, - "grad_norm": 3.8746228218078613, - "kl": 8.3359375, - "learning_rate": 6.761482162362134e-07, - "loss": 0.4935, - "num_tokens": 778278056.0, - "reward": 1.751953125, - "reward_std": 0.5673034191131592, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.90625, - "rewards/tag_count_reward/std": 0.21558640897274017, + "grad_norm": 3.1772000789642334, + "kl": 2.94140625, + "learning_rate": 6.764521497170938e-07, + "loss": 0.1313, + "num_tokens": 846970379.0, + "reward": 0.998046875, + "reward_std": 0.2550550401210785, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.18388766050338745, "step": 1373 }, { @@ -39832,27 +39832,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 795.357421875, - "completions/mean_terminated_length": 741.7821044921875, - "completions/min_length": 193.0, - "completions/min_terminated_length": 193.0, + "completions/max_terminated_length": 1797.0, + "completions/mean_length": 742.609375, + "completions/mean_terminated_length": 740.0548095703125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, "epoch": 0.4690620465989588, - "grad_norm": 3.369460105895996, - "kl": 9.390625, - "learning_rate": 6.756335147275387e-07, - "loss": 0.6074, - "num_tokens": 778760831.0, - "reward": 1.80712890625, - "reward_std": 0.5608662366867065, - "rewards/accuracy_reward/mean": 0.06854838877916336, - "rewards/accuracy_reward/std": 0.25293973088264465, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.2099587321281433, + "grad_norm": 1.9792194366455078, + "kl": 2.56640625, + "learning_rate": 6.759373586233988e-07, + "loss": 0.1079, + "num_tokens": 847426147.0, + "reward": 1.04150390625, + "reward_std": 0.27639952301979065, + "rewards/accuracy_reward/mean": 0.11088709533214569, + "rewards/accuracy_reward/std": 0.3143092691898346, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.92822265625, + "rewards/tag_count_reward/std": 0.1785990297794342, "step": 1374 }, { @@ -39861,27 +39861,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 825.08984375, - "completions/mean_terminated_length": 754.3429565429688, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1993.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 778.703125, + "completions/mean_terminated_length": 778.703125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.4694034309123496, - "grad_norm": 1.8571772575378418, - "kl": 8.59375, - "learning_rate": 6.751186349050683e-07, - "loss": 0.5455, - "num_tokens": 779264509.0, - "reward": 1.76318359375, - "reward_std": 0.597456693649292, - "rewards/accuracy_reward/mean": 0.05443548411130905, - "rewards/accuracy_reward/std": 0.227104052901268, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.90771484375, - "rewards/tag_count_reward/std": 0.2084423005580902, + "grad_norm": 4.733518123626709, + "kl": 4.296875, + "learning_rate": 6.754223886490136e-07, + "loss": 0.171, + "num_tokens": 847906075.0, + "reward": 0.98779296875, + "reward_std": 0.28858983516693115, + "rewards/accuracy_reward/mean": 0.09677419066429138, + "rewards/accuracy_reward/std": 0.2959485352039337, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.89208984375, + "rewards/tag_count_reward/std": 0.21660728752613068, "step": 1375 }, { @@ -39890,27 +39890,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 801.888671875, - "completions/mean_terminated_length": 764.2796630859375, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/max_terminated_length": 1720.0, + "completions/mean_length": 738.2734375, + "completions/mean_terminated_length": 733.1372680664062, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.46974481522574035, - "grad_norm": 1.4824538230895996, - "kl": 6.3984375, - "learning_rate": 6.746035774995805e-07, - "loss": 0.417, - "num_tokens": 779752532.0, - "reward": 1.83544921875, - "reward_std": 0.48997873067855835, - "rewards/accuracy_reward/mean": 0.052419353276491165, - "rewards/accuracy_reward/std": 0.22309619188308716, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.18445254862308502, + "grad_norm": 3.7451682090759277, + "kl": 3.41015625, + "learning_rate": 6.749072405253981e-07, + "loss": 0.1558, + "num_tokens": 848361527.0, + "reward": 1.00830078125, + "reward_std": 0.2626035809516907, + "rewards/accuracy_reward/mean": 0.08870967477560043, + "rewards/accuracy_reward/std": 0.284611314535141, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.92041015625, + "rewards/tag_count_reward/std": 0.19126836955547333, "step": 1376 }, { @@ -39919,27 +39919,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 767.33984375, - "completions/mean_terminated_length": 728.6881103515625, - "completions/min_length": 78.0, - "completions/min_terminated_length": 78.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1886.0, + "completions/max_terminated_length": 1886.0, + "completions/mean_length": 706.33203125, + "completions/mean_terminated_length": 706.33203125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, "epoch": 0.47008619953913117, - "grad_norm": 1.6078094244003296, - "kl": 5.5, - "learning_rate": 6.740883432421044e-07, - "loss": 0.3703, - "num_tokens": 780219154.0, - "reward": 1.84619140625, - "reward_std": 0.5149907469749451, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.18399609625339508, + "grad_norm": 6.075878143310547, + "kl": 4.390625, + "learning_rate": 6.743919149842661e-07, + "loss": 0.2079, + "num_tokens": 848796913.0, + "reward": 0.98828125, + "reward_std": 0.28618401288986206, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.21373461186885834, "step": 1377 }, { @@ -39948,27 +39948,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1966.0, - "completions/mean_length": 856.470703125, - "completions/mean_terminated_length": 795.303955078125, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1521.0, + "completions/max_terminated_length": 1521.0, + "completions/mean_length": 759.90234375, + "completions/mean_terminated_length": 759.90234375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, "epoch": 0.470427583852522, - "grad_norm": 2.1424074172973633, - "kl": 6.203125, - "learning_rate": 6.735729328639213e-07, - "loss": 0.406, - "num_tokens": 780738467.0, - "reward": 1.77685546875, - "reward_std": 0.5383716821670532, - "rewards/accuracy_reward/mean": 0.024193547666072845, - "rewards/accuracy_reward/std": 0.15380479395389557, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.91162109375, - "rewards/tag_count_reward/std": 0.2095487415790558, + "grad_norm": 1.666942834854126, + "kl": 3.1953125, + "learning_rate": 6.738764127575828e-07, + "loss": 0.1178, + "num_tokens": 849266783.0, + "reward": 0.9375, + "reward_std": 0.21183615922927856, + "rewards/accuracy_reward/mean": 0.02217741869390011, + "rewards/accuracy_reward/std": 0.14740893244743347, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.912109375, + "rewards/tag_count_reward/std": 0.19368666410446167, "step": 1378 }, { @@ -39977,27 +39977,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 833.509765625, - "completions/mean_terminated_length": 776.386474609375, - "completions/min_length": 236.0, - "completions/min_terminated_length": 236.0, + "completions/max_terminated_length": 1595.0, + "completions/mean_length": 702.958984375, + "completions/mean_terminated_length": 700.3267822265625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.4707689681659128, - "grad_norm": 2.3079988956451416, - "kl": 6.8125, - "learning_rate": 6.730573470965618e-07, - "loss": 0.4914, - "num_tokens": 781239752.0, - "reward": 1.77685546875, - "reward_std": 0.5395263433456421, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.2032666653394699, + "grad_norm": 2.7315187454223633, + "kl": 3.3671875, + "learning_rate": 6.733607345775646e-07, + "loss": 0.1258, + "num_tokens": 849701226.0, + "reward": 0.98486328125, + "reward_std": 0.25739458203315735, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.91259765625, + "rewards/tag_count_reward/std": 0.19359229505062103, "step": 1379 }, { @@ -40006,27 +40006,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1948.0, - "completions/mean_length": 745.91015625, - "completions/mean_terminated_length": 701.1919555664062, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 1798.0, + "completions/mean_length": 692.083984375, + "completions/mean_terminated_length": 684.0923461914062, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, "epoch": 0.47111035247930355, - "grad_norm": 1.399194598197937, - "kl": 5.16015625, - "learning_rate": 6.725415866718055e-07, - "loss": 0.3408, - "num_tokens": 781711930.0, - "reward": 1.85107421875, - "reward_std": 0.47656652331352234, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.1851712167263031, + "grad_norm": 2.950645923614502, + "kl": 3.69921875, + "learning_rate": 6.728448811766776e-07, + "loss": 0.1873, + "num_tokens": 850145845.0, + "reward": 0.99462890625, + "reward_std": 0.2600945234298706, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.90478515625, + "rewards/tag_count_reward/std": 0.2146575003862381, "step": 1380 }, { @@ -40035,27 +40035,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1958.0, - "completions/mean_length": 741.94140625, - "completions/mean_terminated_length": 677.708984375, - "completions/min_length": 187.0, - "completions/min_terminated_length": 187.0, + "completions/max_terminated_length": 1593.0, + "completions/mean_length": 665.5, + "completions/mean_terminated_length": 662.7944946289062, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, "epoch": 0.47145173679269436, - "grad_norm": 3.3459203243255615, - "kl": 6.53125, - "learning_rate": 6.720256523216802e-07, - "loss": 0.3746, - "num_tokens": 782176380.0, - "reward": 1.79150390625, - "reward_std": 0.5550722479820251, - "rewards/accuracy_reward/mean": 0.06451612710952759, - "rewards/accuracy_reward/std": 0.2459181249141693, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.20440112054347992, + "grad_norm": 4.174452304840088, + "kl": 3.7734375, + "learning_rate": 6.723288532876372e-07, + "loss": 0.1763, + "num_tokens": 850571157.0, + "reward": 1.0146484375, + "reward_std": 0.26300713419914246, + "rewards/accuracy_reward/mean": 0.10080645233392715, + "rewards/accuracy_reward/std": 0.30137622356414795, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.20275656878948212, "step": 1381 }, { @@ -40064,27 +40064,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.078125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 893.115234375, - "completions/mean_terminated_length": 795.24365234375, - "completions/min_length": 231.0, - "completions/min_terminated_length": 231.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 719.642578125, + "completions/mean_terminated_length": 714.433349609375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, "epoch": 0.4717931211060852, - "grad_norm": 2.397712469100952, - "kl": 8.375, - "learning_rate": 6.715095447784602e-07, - "loss": 0.5589, - "num_tokens": 782708455.0, - "reward": 1.69482421875, - "reward_std": 0.5570254325866699, - "rewards/accuracy_reward/mean": 0.01171875, - "rewards/accuracy_reward/std": 0.10772226005792618, - "rewards/format_reward/mean": 0.794921875, - "rewards/format_reward/std": 0.4041535556316376, - "rewards/tag_count_reward/mean": 0.88818359375, - "rewards/tag_count_reward/std": 0.23107852041721344, + "grad_norm": 2.004256248474121, + "kl": 3.97265625, + "learning_rate": 6.718126516434065e-07, + "loss": 0.1597, + "num_tokens": 851014414.0, + "reward": 0.9140625, + "reward_std": 0.2476644217967987, + "rewards/accuracy_reward/mean": 0.01953125, + "rewards/accuracy_reward/std": 0.1385180652141571, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.892578125, + "rewards/tag_count_reward/std": 0.22048601508140564, "step": 1382 }, { @@ -40093,27 +40093,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.07421875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 866.09765625, - "completions/mean_terminated_length": 771.345947265625, - "completions/min_length": 115.0, - "completions/min_terminated_length": 115.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1787.0, + "completions/max_terminated_length": 1787.0, + "completions/mean_length": 753.98046875, + "completions/mean_terminated_length": 753.98046875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, "epoch": 0.472134505419476, - "grad_norm": 1.7249161005020142, - "kl": 8.0703125, - "learning_rate": 6.709932647746659e-07, - "loss": 0.5049, - "num_tokens": 783222281.0, - "reward": 1.70947265625, - "reward_std": 0.597592830657959, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.77734375, - "rewards/format_reward/std": 0.41643625497817993, - "rewards/tag_count_reward/mean": 0.88720703125, - "rewards/tag_count_reward/std": 0.22953926026821136, + "grad_norm": 1.8866486549377441, + "kl": 3.4375, + "learning_rate": 6.71296276977195e-07, + "loss": 0.1761, + "num_tokens": 851470836.0, + "reward": 0.9736328125, + "reward_std": 0.25815194845199585, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.9033203125, + "rewards/tag_count_reward/std": 0.20435261726379395, "step": 1383 }, { @@ -40122,27 +40122,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 775.4921875, - "completions/mean_terminated_length": 712.9097900390625, - "completions/min_length": 96.0, - "completions/min_terminated_length": 96.0, + "completions/max_terminated_length": 1651.0, + "completions/mean_length": 675.65234375, + "completions/mean_terminated_length": 667.5639038085938, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, "epoch": 0.47247588973286675, - "grad_norm": 2.245136260986328, - "kl": 7.265625, - "learning_rate": 6.704768130430619e-07, - "loss": 0.4825, - "num_tokens": 783694165.0, - "reward": 1.78857421875, - "reward_std": 0.5597871541976929, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.796875, - "rewards/format_reward/std": 0.4027182459831238, - "rewards/tag_count_reward/mean": 0.90576171875, - "rewards/tag_count_reward/std": 0.21164949238300323, + "grad_norm": 3.5500969886779785, + "kl": 3.697265625, + "learning_rate": 6.707797300224585e-07, + "loss": 0.1999, + "num_tokens": 851891602.0, + "reward": 1.02001953125, + "reward_std": 0.28511446714401245, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.90478515625, + "rewards/tag_count_reward/std": 0.20652645826339722, "step": 1384 }, { @@ -40151,27 +40151,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 783.619140625, - "completions/mean_terminated_length": 732.2214965820312, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 1508.0, + "completions/mean_length": 641.888671875, + "completions/mean_terminated_length": 639.136962890625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, "epoch": 0.47281727404625756, - "grad_norm": 2.1172027587890625, - "kl": 5.484375, - "learning_rate": 6.699601903166575e-07, - "loss": 0.3793, - "num_tokens": 784180482.0, - "reward": 1.83544921875, - "reward_std": 0.47278302907943726, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.1837882399559021, + "grad_norm": 2.781998872756958, + "kl": 3.4765625, + "learning_rate": 6.702630115128971e-07, + "loss": 0.1636, + "num_tokens": 852305353.0, + "reward": 0.98876953125, + "reward_std": 0.28356242179870605, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.90478515625, + "rewards/tag_count_reward/std": 0.21121110022068024, "step": 1385 }, { @@ -40180,27 +40180,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.064453125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1858.0, - "completions/mean_length": 856.73046875, - "completions/mean_terminated_length": 774.6597290039062, - "completions/min_length": 184.0, - "completions/min_terminated_length": 184.0, + "completions/max_terminated_length": 1897.0, + "completions/mean_length": 766.630859375, + "completions/mean_terminated_length": 764.123291015625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.4731586583596484, - "grad_norm": 1.7951167821884155, - "kl": 7.1953125, - "learning_rate": 6.69443397328704e-07, - "loss": 0.4726, - "num_tokens": 784693864.0, - "reward": 1.83056640625, - "reward_std": 0.6076481342315674, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.90283203125, - "rewards/tag_count_reward/std": 0.2166205197572708, + "grad_norm": 1.7263718843460083, + "kl": 3.1796875, + "learning_rate": 6.69746122182455e-07, + "loss": 0.1213, + "num_tokens": 852772604.0, + "reward": 1.0341796875, + "reward_std": 0.32438021898269653, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9013671875, + "rewards/tag_count_reward/std": 0.21224290132522583, "step": 1386 }, { @@ -40209,27 +40209,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.07421875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1947.0, - "completions/mean_length": 850.3203125, - "completions/mean_terminated_length": 754.3037719726562, - "completions/min_length": 182.0, - "completions/min_terminated_length": 182.0, + "completions/max_terminated_length": 1700.0, + "completions/mean_length": 723.474609375, + "completions/mean_terminated_length": 720.882568359375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, "epoch": 0.4735000426730392, - "grad_norm": 1.806761384010315, - "kl": 7.796875, - "learning_rate": 6.689264348126944e-07, - "loss": 0.5443, - "num_tokens": 785207484.0, - "reward": 1.75732421875, - "reward_std": 0.5716904401779175, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.89794921875, - "rewards/tag_count_reward/std": 0.2391698956489563, + "grad_norm": 4.256781578063965, + "kl": 3.24609375, + "learning_rate": 6.692290627653186e-07, + "loss": 0.139, + "num_tokens": 853221279.0, + "reward": 0.9765625, + "reward_std": 0.2756841778755188, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.21215508878231049, "step": 1387 }, { @@ -40238,27 +40238,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1954.0, - "completions/mean_length": 790.935546875, - "completions/mean_terminated_length": 712.695068359375, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 1621.0, + "completions/mean_length": 682.078125, + "completions/mean_terminated_length": 679.4050903320312, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, "epoch": 0.47384142698642995, - "grad_norm": 1.3584413528442383, - "kl": 8.265625, - "learning_rate": 6.684093035023626e-07, - "loss": 0.5407, - "num_tokens": 785686187.0, - "reward": 1.81494140625, - "reward_std": 0.6129561066627502, - "rewards/accuracy_reward/mean": 0.119140625, - "rewards/accuracy_reward/std": 0.32427072525024414, - "rewards/format_reward/mean": 0.796875, - "rewards/format_reward/std": 0.4027182459831238, - "rewards/tag_count_reward/mean": 0.89892578125, - "rewards/tag_count_reward/std": 0.22319906949996948, + "grad_norm": 2.429532766342163, + "kl": 3.421875, + "learning_rate": 6.68711833995916e-07, + "loss": 0.1801, + "num_tokens": 853644247.0, + "reward": 1.0654296875, + "reward_std": 0.3016743063926697, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.20275656878948212, "step": 1388 }, { @@ -40267,27 +40267,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1997.0, - "completions/mean_length": 781.400390625, - "completions/mean_terminated_length": 729.91259765625, - "completions/min_length": 12.0, - "completions/min_terminated_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1678.0, + "completions/max_terminated_length": 1678.0, + "completions/mean_length": 648.642578125, + "completions/mean_terminated_length": 648.642578125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.47418281129982076, - "grad_norm": 2.051622152328491, - "kl": 7.609375, - "learning_rate": 6.678920041316818e-07, - "loss": 0.4913, - "num_tokens": 786168712.0, - "reward": 1.8271484375, - "reward_std": 0.5685397386550903, - "rewards/accuracy_reward/mean": 0.07459677755832672, - "rewards/accuracy_reward/std": 0.263004869222641, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.19635941088199615, + "grad_norm": 2.415675640106201, + "kl": 2.95703125, + "learning_rate": 6.681944366089162e-07, + "loss": 0.1097, + "num_tokens": 854058800.0, + "reward": 1.0068359375, + "reward_std": 0.26265156269073486, + "rewards/accuracy_reward/mean": 0.09072580933570862, + "rewards/accuracy_reward/std": 0.2875087857246399, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.9169921875, + "rewards/tag_count_reward/std": 0.19583314657211304, "step": 1389 }, { @@ -40296,27 +40296,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 810.513671875, - "completions/mean_terminated_length": 730.7588500976562, - "completions/min_length": 204.0, - "completions/min_terminated_length": 204.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1775.0, + "completions/max_terminated_length": 1775.0, + "completions/mean_length": 702.185546875, + "completions/mean_terminated_length": 702.185546875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, "epoch": 0.4745241956132116, - "grad_norm": 3.496229648590088, - "kl": 8.8046875, - "learning_rate": 6.67374537434864e-07, - "loss": 0.5496, - "num_tokens": 786655439.0, - "reward": 1.77734375, - "reward_std": 0.5651187896728516, - "rewards/accuracy_reward/mean": 0.06451612710952759, - "rewards/accuracy_reward/std": 0.2459181249141693, - "rewards/format_reward/mean": 0.8125, - "rewards/format_reward/std": 0.39069411158561707, - "rewards/tag_count_reward/mean": 0.90234375, - "rewards/tag_count_reward/std": 0.2211524099111557, + "grad_norm": 2.079864025115967, + "kl": 4.08984375, + "learning_rate": 6.676768713392272e-07, + "loss": 0.1965, + "num_tokens": 854490063.0, + "reward": 0.986328125, + "reward_std": 0.2514050006866455, + "rewards/accuracy_reward/mean": 0.07459677755832672, + "rewards/accuracy_reward/std": 0.263004869222641, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.908203125, + "rewards/tag_count_reward/std": 0.20659643411636353, "step": 1390 }, { @@ -40325,27 +40325,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 844.98046875, - "completions/mean_terminated_length": 778.00830078125, - "completions/min_length": 209.0, - "completions/min_terminated_length": 209.0, + "completions/max_terminated_length": 1699.0, + "completions/mean_length": 687.373046875, + "completions/mean_terminated_length": 684.7103881835938, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.4748655799266024, - "grad_norm": 1.6232308149337769, - "kl": 8.71875, - "learning_rate": 6.668569041463582e-07, - "loss": 0.5677, - "num_tokens": 787171461.0, - "reward": 1.6982421875, - "reward_std": 0.5887287855148315, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.779296875, - "rewards/format_reward/std": 0.4151262938976288, - "rewards/tag_count_reward/mean": 0.8896484375, - "rewards/tag_count_reward/std": 0.23257318139076233, + "grad_norm": 1.9395735263824463, + "kl": 3.10546875, + "learning_rate": 6.67159138921996e-07, + "loss": 0.1274, + "num_tokens": 854925390.0, + "reward": 0.9619140625, + "reward_std": 0.2960132658481598, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.8994140625, + "rewards/tag_count_reward/std": 0.21074290573596954, "step": 1391 }, { @@ -40354,27 +40354,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 829.41015625, - "completions/mean_terminated_length": 764.2180786132812, - "completions/min_length": 123.0, - "completions/min_terminated_length": 123.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1664.0, + "completions/max_terminated_length": 1664.0, + "completions/mean_length": 708.71484375, + "completions/mean_terminated_length": 708.71484375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.47520696423999315, - "grad_norm": 1.2676295042037964, - "kl": 6.96875, - "learning_rate": 6.663391050008505e-07, - "loss": 0.476, - "num_tokens": 787675415.0, - "reward": 1.81396484375, - "reward_std": 0.504416823387146, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.19436074793338776, + "grad_norm": 2.037200689315796, + "kl": 3.3125, + "learning_rate": 6.666412400926063e-07, + "loss": 0.1243, + "num_tokens": 855367548.0, + "reward": 0.92626953125, + "reward_std": 0.22598353028297424, + "rewards/accuracy_reward/mean": 0.0234375, + "rewards/accuracy_reward/std": 0.15143637359142303, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.90087890625, + "rewards/tag_count_reward/std": 0.21230311691761017, "step": 1392 }, { @@ -40383,27 +40383,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 792.07421875, - "completions/mean_terminated_length": 741.0203247070312, - "completions/min_length": 71.0, - "completions/min_terminated_length": 71.0, + "completions/max_terminated_length": 1652.0, + "completions/mean_length": 731.40234375, + "completions/mean_terminated_length": 728.8258056640625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, "epoch": 0.47554834855338396, - "grad_norm": 2.010774850845337, - "kl": 7.109375, - "learning_rate": 6.658211407332619e-07, - "loss": 0.4803, - "num_tokens": 788162637.0, - "reward": 1.8095703125, - "reward_std": 0.5974607467651367, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.9130859375, - "rewards/tag_count_reward/std": 0.211032897233963, + "grad_norm": 2.616205930709839, + "kl": 3.171875, + "learning_rate": 6.66123175586679e-07, + "loss": 0.1478, + "num_tokens": 855823706.0, + "reward": 1.00927734375, + "reward_std": 0.2902449369430542, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.91357421875, + "rewards/tag_count_reward/std": 0.2008453756570816, "step": 1393 }, { @@ -40412,27 +40412,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1973.0, - "completions/mean_length": 790.86328125, - "completions/mean_terminated_length": 726.3285522460938, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1558.0, + "completions/max_terminated_length": 1558.0, + "completions/mean_length": 710.7109375, + "completions/mean_terminated_length": 710.7109375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, "epoch": 0.4758897328667748, - "grad_norm": 1.3909481763839722, - "kl": 8.2578125, - "learning_rate": 6.65303012078748e-07, - "loss": 0.5387, - "num_tokens": 788641143.0, - "reward": 1.78564453125, - "reward_std": 0.6270242929458618, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.80078125, - "rewards/format_reward/std": 0.39980348944664, - "rewards/tag_count_reward/mean": 0.89697265625, - "rewards/tag_count_reward/std": 0.22773799300193787, + "grad_norm": 1.856652021408081, + "kl": 2.8515625, + "learning_rate": 6.656049461400695e-07, + "loss": 0.1163, + "num_tokens": 856261174.0, + "reward": 1.0458984375, + "reward_std": 0.3149603307247162, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.19910427927970886, "step": 1394 }, { @@ -40441,27 +40441,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1699.0, - "completions/mean_length": 804.697265625, - "completions/mean_terminated_length": 751.5214233398438, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1893.0, + "completions/max_terminated_length": 1893.0, + "completions/mean_length": 733.474609375, + "completions/mean_terminated_length": 733.474609375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, "epoch": 0.4762311171801656, - "grad_norm": 2.7253928184509277, - "kl": 5.2421875, - "learning_rate": 6.647847197726978e-07, - "loss": 0.3513, - "num_tokens": 789132396.0, - "reward": 1.876953125, - "reward_std": 0.494552344083786, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.9375, - "rewards/tag_count_reward/std": 0.17901119589805603, + "grad_norm": 1.6154507398605347, + "kl": 2.494140625, + "learning_rate": 6.65086552488868e-07, + "loss": 0.0708, + "num_tokens": 856715961.0, + "reward": 0.9755859375, + "reward_std": 0.2549571990966797, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9111328125, + "rewards/tag_count_reward/std": 0.20371569693088531, "step": 1395 }, { @@ -40470,27 +40470,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 865.7578125, - "completions/mean_terminated_length": 805.0678100585938, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1795.0, + "completions/max_terminated_length": 1795.0, + "completions/mean_length": 767.9609375, + "completions/mean_terminated_length": 767.9609375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, "epoch": 0.47657250149355634, - "grad_norm": 2.511202335357666, - "kl": 6.2890625, - "learning_rate": 6.642662645507322e-07, - "loss": 0.4536, - "num_tokens": 789652256.0, - "reward": 1.83154296875, - "reward_std": 0.5061460733413696, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.18896137177944183, + "grad_norm": 1.6102898120880127, + "kl": 2.0859375, + "learning_rate": 6.645679953693981e-07, + "loss": 0.0859, + "num_tokens": 857185749.0, + "reward": 0.97412109375, + "reward_std": 0.2627238631248474, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.91748046875, + "rewards/tag_count_reward/std": 0.19384385645389557, "step": 1396 }, { @@ -40499,27 +40499,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 856.08984375, - "completions/mean_terminated_length": 797.4712524414062, - "completions/min_length": 2.0, - "completions/min_terminated_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1966.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 764.26953125, + "completions/mean_terminated_length": 764.26953125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, "epoch": 0.47691388580694716, - "grad_norm": 3.641901731491089, - "kl": 8.0546875, - "learning_rate": 6.637476471487036e-07, - "loss": 0.4617, - "num_tokens": 790164814.0, - "reward": 1.7822265625, - "reward_std": 0.5402143597602844, - "rewards/accuracy_reward/mean": 0.05443548411130905, - "rewards/accuracy_reward/std": 0.227104052901268, - "rewards/format_reward/mean": 0.814453125, - "rewards/format_reward/std": 0.38912075757980347, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.20055793225765228, + "grad_norm": 3.021604537963867, + "kl": 1.921875, + "learning_rate": 6.640492755182152e-07, + "loss": 0.0734, + "num_tokens": 857651295.0, + "reward": 1.01220703125, + "reward_std": 0.25866368412971497, + "rewards/accuracy_reward/mean": 0.0786290317773819, + "rewards/accuracy_reward/std": 0.26943066716194153, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.17390406131744385, "step": 1397 }, { @@ -40528,27 +40528,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1811.0, - "completions/mean_length": 817.111328125, - "completions/mean_terminated_length": 751.2612915039062, - "completions/min_length": 20.0, - "completions/min_terminated_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1815.0, + "completions/max_terminated_length": 1815.0, + "completions/mean_length": 753.896484375, + "completions/mean_terminated_length": 753.896484375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, "epoch": 0.47725527012033797, - "grad_norm": 2.3625988960266113, - "kl": 7.6953125, - "learning_rate": 6.632288683026946e-07, - "loss": 0.4697, - "num_tokens": 790654695.0, - "reward": 1.8330078125, - "reward_std": 0.5581883788108826, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.2031896710395813, + "grad_norm": 1.5214219093322754, + "kl": 2.76171875, + "learning_rate": 6.635303936721056e-07, + "loss": 0.1394, + "num_tokens": 858108810.0, + "reward": 1.02099609375, + "reward_std": 0.3259813189506531, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.90380859375, + "rewards/tag_count_reward/std": 0.210186168551445, "step": 1398 }, { @@ -40557,27 +40557,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 853.94921875, - "completions/mean_terminated_length": 792.6530151367188, - "completions/min_length": 32.0, - "completions/min_terminated_length": 32.0, + "completions/max_terminated_length": 1817.0, + "completions/mean_length": 788.740234375, + "completions/mean_terminated_length": 783.802001953125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, "epoch": 0.4775966544337288, - "grad_norm": 3.0895321369171143, - "kl": 8.1875, - "learning_rate": 6.62709928749017e-07, - "loss": 0.4903, - "num_tokens": 791178909.0, - "reward": 1.70703125, - "reward_std": 0.5963179469108582, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.771484375, - "rewards/format_reward/std": 0.4202871024608612, - "rewards/tag_count_reward/mean": 0.88671875, - "rewards/tag_count_reward/std": 0.22903135418891907, + "grad_norm": 3.359818458557129, + "kl": 2.826171875, + "learning_rate": 6.630113505680864e-07, + "loss": 0.1036, + "num_tokens": 858599637.0, + "reward": 0.97607421875, + "reward_std": 0.29234710335731506, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.90185546875, + "rewards/tag_count_reward/std": 0.20455066859722137, "step": 1399 }, { @@ -40586,27 +40586,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 798.802734375, - "completions/mean_terminated_length": 755.9010620117188, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 1809.0, + "completions/mean_length": 750.138671875, + "completions/mean_terminated_length": 747.5988159179688, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, "epoch": 0.47793803874711954, - "grad_norm": 1.0735238790512085, - "kl": 5.6328125, - "learning_rate": 6.621908292242104e-07, - "loss": 0.33, - "num_tokens": 791664888.0, - "reward": 1.7802734375, - "reward_std": 0.5403667688369751, - "rewards/accuracy_reward/mean": 0.05443548411130905, - "rewards/accuracy_reward/std": 0.227104052901268, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.19458001852035522, + "grad_norm": 4.321170806884766, + "kl": 3.09765625, + "learning_rate": 6.624921469434035e-07, + "loss": 0.1712, + "num_tokens": 859060700.0, + "reward": 0.9716796875, + "reward_std": 0.25452613830566406, + "rewards/accuracy_reward/mean": 0.060483869165182114, + "rewards/accuracy_reward/std": 0.2386218160390854, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9091796875, + "rewards/tag_count_reward/std": 0.1973496377468109, "step": 1400 }, { @@ -40615,27 +40615,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 818.6015625, - "completions/mean_terminated_length": 763.404052734375, - "completions/min_length": 206.0, - "completions/min_terminated_length": 206.0, + "completions/max_terminated_length": 1921.0, + "completions/mean_length": 732.822265625, + "completions/mean_terminated_length": 727.6647338867188, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, "epoch": 0.47827942306051036, - "grad_norm": 1.592006802558899, - "kl": 7.0078125, - "learning_rate": 6.616715704650418e-07, - "loss": 0.4583, - "num_tokens": 792159836.0, - "reward": 1.79052734375, - "reward_std": 0.5385003089904785, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.20327135920524597, + "grad_norm": 4.255768299102783, + "kl": 4.58984375, + "learning_rate": 6.619727835355303e-07, + "loss": 0.2552, + "num_tokens": 859511729.0, + "reward": 0.9423828125, + "reward_std": 0.22777177393436432, + "rewards/accuracy_reward/mean": 0.037109375, + "rewards/accuracy_reward/std": 0.18921469151973724, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9013671875, + "rewards/tag_count_reward/std": 0.20521119236946106, "step": 1401 }, { @@ -40644,27 +40644,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 807.796875, - "completions/mean_terminated_length": 775.4869995117188, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 1512.0, + "completions/mean_length": 730.21484375, + "completions/mean_terminated_length": 727.635986328125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, "epoch": 0.47862080737390117, - "grad_norm": 1.1120721101760864, - "kl": 5.20703125, - "learning_rate": 6.611521532085038e-07, - "loss": 0.3211, - "num_tokens": 792656564.0, - "reward": 1.80224609375, - "reward_std": 0.5459111928939819, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.19778190553188324, + "grad_norm": 2.260495901107788, + "kl": 3.453125, + "learning_rate": 6.614532610821678e-07, + "loss": 0.1826, + "num_tokens": 859968735.0, + "reward": 0.94580078125, + "reward_std": 0.23305454850196838, + "rewards/accuracy_reward/mean": 0.033203125, + "rewards/accuracy_reward/std": 0.17934183776378632, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.90673828125, + "rewards/tag_count_reward/std": 0.19837526977062225, "step": 1402 }, { @@ -40673,27 +40673,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1977.0, - "completions/mean_length": 827.939453125, - "completions/mean_terminated_length": 780.9188232421875, - "completions/min_length": 45.0, - "completions/min_terminated_length": 45.0, + "completions/max_terminated_length": 1588.0, + "completions/mean_length": 760.095703125, + "completions/mean_terminated_length": 752.5049438476562, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.478962191687292, - "grad_norm": 0.9376634955406189, - "kl": 6.22265625, - "learning_rate": 6.606325781918144e-07, - "loss": 0.3956, - "num_tokens": 793160389.0, - "reward": 1.73974609375, - "reward_std": 0.5454986691474915, + "grad_norm": 3.251796007156372, + "kl": 3.30859375, + "learning_rate": 6.609335803212428e-07, + "loss": 0.1824, + "num_tokens": 860437824.0, + "reward": 0.9375, + "reward_std": 0.23910224437713623, "rewards/accuracy_reward/mean": 0.02217741869390011, "rewards/accuracy_reward/std": 0.14740893244743347, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.91162109375, - "rewards/tag_count_reward/std": 0.20302677154541016, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.912109375, + "rewards/tag_count_reward/std": 0.20233432948589325, "step": 1403 }, { @@ -40702,27 +40702,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 850.501953125, - "completions/mean_terminated_length": 801.8231201171875, - "completions/min_length": 115.0, - "completions/min_terminated_length": 115.0, + "completions/max_terminated_length": 1798.0, + "completions/mean_length": 712.7109375, + "completions/mean_terminated_length": 707.4745483398438, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, "epoch": 0.4793035760006828, - "grad_norm": 1.7362650632858276, - "kl": 6.296875, - "learning_rate": 6.601128461524152e-07, - "loss": 0.4108, - "num_tokens": 793665142.0, - "reward": 1.76904296875, - "reward_std": 0.5725255608558655, - "rewards/accuracy_reward/mean": 0.04435483738780022, - "rewards/accuracy_reward/std": 0.2060900777578354, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.19551756978034973, + "grad_norm": 2.3958215713500977, + "kl": 3.8671875, + "learning_rate": 6.604137419909064e-07, + "loss": 0.1665, + "num_tokens": 860872028.0, + "reward": 0.927734375, + "reward_std": 0.273823618888855, + "rewards/accuracy_reward/mean": 0.03629032149910927, + "rewards/accuracy_reward/std": 0.1872003972530365, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.88671875, + "rewards/tag_count_reward/std": 0.22087571024894714, "step": 1404 }, { @@ -40731,27 +40731,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1996.0, - "completions/mean_length": 827.376953125, - "completions/mean_terminated_length": 762.0761108398438, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1483.0, + "completions/max_terminated_length": 1483.0, + "completions/mean_length": 727.44140625, + "completions/mean_terminated_length": 727.44140625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, "epoch": 0.47964496031407355, - "grad_norm": 1.6607929468154907, - "kl": 7.4375, - "learning_rate": 6.595929578279708e-07, - "loss": 0.5082, - "num_tokens": 794168279.0, - "reward": 1.771484375, - "reward_std": 0.5720074772834778, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.8046875, - "rewards/format_reward/std": 0.3968288004398346, - "rewards/tag_count_reward/mean": 0.91015625, - "rewards/tag_count_reward/std": 0.20979996025562286, + "grad_norm": 2.9263436794281006, + "kl": 4.54296875, + "learning_rate": 6.598937468295344e-07, + "loss": 0.2373, + "num_tokens": 861323998.0, + "reward": 0.95751953125, + "reward_std": 0.29104703664779663, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.88525390625, + "rewards/tag_count_reward/std": 0.22315198183059692, "step": 1405 }, { @@ -40760,27 +40760,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 766.435546875, - "completions/mean_terminated_length": 719.7388916015625, - "completions/min_length": 94.0, - "completions/min_terminated_length": 94.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 710.6484375, + "completions/mean_terminated_length": 705.4039916992188, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, "epoch": 0.47998634462746437, - "grad_norm": 3.086946725845337, - "kl": 7.2265625, - "learning_rate": 6.590729139563675e-07, - "loss": 0.4113, - "num_tokens": 794639558.0, - "reward": 1.77392578125, - "reward_std": 0.6171959042549133, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.787109375, - "rewards/format_reward/std": 0.409751296043396, - "rewards/tag_count_reward/mean": 0.89697265625, - "rewards/tag_count_reward/std": 0.22064577043056488, + "grad_norm": 4.403470516204834, + "kl": 4.89453125, + "learning_rate": 6.593735955757246e-07, + "loss": 0.2923, + "num_tokens": 861766714.0, + "reward": 1.046875, + "reward_std": 0.31403157114982605, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.896484375, + "rewards/tag_count_reward/std": 0.21221813559532166, "step": 1406 }, { @@ -40789,27 +40789,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1947.0, - "completions/mean_length": 781.287109375, - "completions/mean_terminated_length": 737.7838745117188, - "completions/min_length": 50.0, - "completions/min_terminated_length": 50.0, + "completions/max_terminated_length": 1874.0, + "completions/mean_length": 732.236328125, + "completions/mean_terminated_length": 724.4813842773438, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, "epoch": 0.4803277289408552, - "grad_norm": 1.5814425945281982, - "kl": 7.203125, - "learning_rate": 6.585527152757128e-07, - "loss": 0.4377, - "num_tokens": 795114585.0, - "reward": 1.81787109375, - "reward_std": 0.5788640379905701, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.20743593573570251, + "grad_norm": 4.243809700012207, + "kl": 4.13671875, + "learning_rate": 6.58853288968297e-07, + "loss": 0.2057, + "num_tokens": 862216627.0, + "reward": 0.94384765625, + "reward_std": 0.2750888168811798, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.88720703125, + "rewards/tag_count_reward/std": 0.22084921598434448, "step": 1407 }, { @@ -40818,27 +40818,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1949.0, - "completions/mean_length": 725.12109375, - "completions/mean_terminated_length": 676.9190673828125, - "completions/min_length": 52.0, - "completions/min_terminated_length": 52.0, + "completions/max_terminated_length": 1796.0, + "completions/mean_length": 666.966796875, + "completions/mean_terminated_length": 664.26416015625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, "epoch": 0.480669113254246, - "grad_norm": 2.2530622482299805, - "kl": 6.734375, - "learning_rate": 6.580323625243332e-07, - "loss": 0.4232, - "num_tokens": 795561687.0, - "reward": 1.81689453125, - "reward_std": 0.5578827261924744, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.20260746777057648, + "grad_norm": 1.7676359415054321, + "kl": 3.15234375, + "learning_rate": 6.583328277462919e-07, + "loss": 0.1922, + "num_tokens": 862633954.0, + "reward": 0.98828125, + "reward_std": 0.24938128888607025, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.19499453902244568, "step": 1408 }, { @@ -40847,27 +40847,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1962.0, - "completions/mean_length": 718.115234375, - "completions/mean_terminated_length": 686.1980590820312, - "completions/min_length": 190.0, - "completions/min_terminated_length": 190.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1976.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 677.7734375, + "completions/mean_terminated_length": 677.7734375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.48101049756763675, - "grad_norm": 2.0103352069854736, - "kl": 5.1796875, - "learning_rate": 6.575118564407742e-07, - "loss": 0.3535, - "num_tokens": 796008514.0, - "reward": 1.9111328125, - "reward_std": 0.49185386300086975, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.9462890625, - "rewards/tag_count_reward/std": 0.15418460965156555, + "grad_norm": 3.514169692993164, + "kl": 3.08203125, + "learning_rate": 6.578122126489696e-07, + "loss": 0.1818, + "num_tokens": 863060126.0, + "reward": 1.01953125, + "reward_std": 0.2820933759212494, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.1947886198759079, "step": 1409 }, { @@ -40876,27 +40876,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 759.279296875, - "completions/mean_terminated_length": 725.7054443359375, - "completions/min_length": 55.0, - "completions/min_terminated_length": 55.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1520.0, + "completions/max_terminated_length": 1520.0, + "completions/mean_length": 667.751953125, + "completions/mean_terminated_length": 667.751953125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, "epoch": 0.48135188188102757, - "grad_norm": 2.951759099960327, - "kl": 6.1328125, - "learning_rate": 6.569911977637994e-07, - "loss": 0.4127, - "num_tokens": 796480241.0, - "reward": 1.82080078125, - "reward_std": 0.5672469139099121, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.20031657814979553, + "grad_norm": 2.8376305103302, + "kl": 3.21484375, + "learning_rate": 6.572914444158084e-07, + "loss": 0.1761, + "num_tokens": 863484991.0, + "reward": 0.96142578125, + "reward_std": 0.2653089165687561, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.90283203125, + "rewards/tag_count_reward/std": 0.20079778134822845, "step": 1410 }, { @@ -40905,27 +40905,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 804.99609375, - "completions/mean_terminated_length": 749.187744140625, - "completions/min_length": 60.0, - "completions/min_terminated_length": 60.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 720.078125, + "completions/mean_terminated_length": 717.4794311523438, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, "epoch": 0.4816932661944184, - "grad_norm": 2.1062204837799072, - "kl": 7.015625, - "learning_rate": 6.564703872323883e-07, - "loss": 0.4235, - "num_tokens": 796975519.0, - "reward": 1.806640625, - "reward_std": 0.5600014925003052, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.19951897859573364, + "grad_norm": 2.4625940322875977, + "kl": 3.59375, + "learning_rate": 6.567705237865049e-07, + "loss": 0.2191, + "num_tokens": 863936791.0, + "reward": 0.98681640625, + "reward_std": 0.2922849953174591, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.89697265625, + "rewards/tag_count_reward/std": 0.21331787109375, "step": 1411 }, { @@ -40934,27 +40934,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1971.0, - "completions/mean_length": 775.48828125, - "completions/mean_terminated_length": 739.71484375, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 726.861328125, + "completions/mean_terminated_length": 719.07470703125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.4820346505078092, - "grad_norm": 1.2882215976715088, - "kl": 5.541015625, - "learning_rate": 6.559494255857362e-07, - "loss": 0.3697, - "num_tokens": 797452313.0, - "reward": 1.8349609375, - "reward_std": 0.4919002652168274, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.185324028134346, + "grad_norm": 2.6543076038360596, + "kl": 2.921875, + "learning_rate": 6.562494515009715e-07, + "loss": 0.1735, + "num_tokens": 864388688.0, + "reward": 0.9921875, + "reward_std": 0.29198989272117615, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.1958160698413849, "step": 1412 }, { @@ -40963,27 +40963,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 717.904296875, - "completions/mean_terminated_length": 694.1053466796875, - "completions/min_length": 75.0, - "completions/min_terminated_length": 75.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1742.0, + "completions/max_terminated_length": 1742.0, + "completions/mean_length": 669.189453125, + "completions/mean_terminated_length": 669.189453125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, "epoch": 0.48237603482119995, - "grad_norm": 2.3797285556793213, - "kl": 5.15625, - "learning_rate": 6.554283135632529e-07, - "loss": 0.3489, - "num_tokens": 797892824.0, - "reward": 1.87841796875, - "reward_std": 0.5279759764671326, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.93701171875, - "rewards/tag_count_reward/std": 0.17642973363399506, + "grad_norm": 2.486616849899292, + "kl": 2.232421875, + "learning_rate": 6.557282282993363e-07, + "loss": 0.1008, + "num_tokens": 864804257.0, + "reward": 1.0029296875, + "reward_std": 0.23314526677131653, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.1702537089586258, "step": 1413 }, { @@ -40992,27 +40992,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 779.306640625, - "completions/mean_terminated_length": 738.3810424804688, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1617.0, + "completions/max_terminated_length": 1617.0, + "completions/mean_length": 691.57421875, + "completions/mean_terminated_length": 691.57421875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, "epoch": 0.48271741913459076, - "grad_norm": 1.7999489307403564, - "kl": 6.3046875, - "learning_rate": 6.549070519045615e-07, - "loss": 0.4473, - "num_tokens": 798368725.0, - "reward": 1.8359375, - "reward_std": 0.4930545687675476, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.9375, - "rewards/tag_count_reward/std": 0.17969314754009247, + "grad_norm": 2.9065868854522705, + "kl": 2.689453125, + "learning_rate": 6.552068549219415e-07, + "loss": 0.1343, + "num_tokens": 865235239.0, + "reward": 0.9453125, + "reward_std": 0.24875755608081818, + "rewards/accuracy_reward/mean": 0.033203125, + "rewards/accuracy_reward/std": 0.17934183776378632, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.908203125, + "rewards/tag_count_reward/std": 0.20301324129104614, "step": 1414 }, { @@ -41021,27 +41021,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1748.0, - "completions/mean_length": 686.099609375, - "completions/mean_terminated_length": 650.6192626953125, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/max_terminated_length": 1515.0, + "completions/mean_length": 668.033203125, + "completions/mean_terminated_length": 662.62158203125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, "epoch": 0.4830588034479816, - "grad_norm": 1.749036431312561, - "kl": 5.484375, - "learning_rate": 6.543856413494979e-07, - "loss": 0.3711, - "num_tokens": 798794648.0, - "reward": 1.91064453125, - "reward_std": 0.4636210799217224, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.89453125, - "rewards/format_reward/std": 0.3074568510055542, - "rewards/tag_count_reward/mean": 0.94970703125, - "rewards/tag_count_reward/std": 0.1678251326084137, + "grad_norm": 4.7207536697387695, + "kl": 2.2578125, + "learning_rate": 6.546853321093429e-07, + "loss": 0.1539, + "num_tokens": 865651912.0, + "reward": 1.0244140625, + "reward_std": 0.2378767430782318, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.17239542305469513, "step": 1415 }, { @@ -41050,27 +41050,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1960.0, - "completions/mean_length": 725.783203125, - "completions/mean_terminated_length": 694.050048828125, - "completions/min_length": 69.0, - "completions/min_terminated_length": 69.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1557.0, + "completions/max_terminated_length": 1557.0, + "completions/mean_length": 683.82421875, + "completions/mean_terminated_length": 683.82421875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.4834001877613724, - "grad_norm": 1.747126817703247, - "kl": 4.9921875, - "learning_rate": 6.538640826381086e-07, - "loss": 0.3184, - "num_tokens": 799245401.0, - "reward": 1.8642578125, - "reward_std": 0.4920358955860138, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.16953378915786743, + "grad_norm": 2.8659422397613525, + "kl": 2.05078125, + "learning_rate": 6.541636606023086e-07, + "loss": 0.1104, + "num_tokens": 866081182.0, + "reward": 1.068359375, + "reward_std": 0.2919410765171051, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.17615941166877747, "step": 1416 }, { @@ -41079,27 +41079,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1951.0, - "completions/mean_length": 717.318359375, - "completions/mean_terminated_length": 682.6513061523438, - "completions/min_length": 58.0, - "completions/min_terminated_length": 58.0, + "completions/max_terminated_length": 1857.0, + "completions/mean_length": 719.9140625, + "completions/mean_terminated_length": 717.3150634765625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, "epoch": 0.48374157207476315, - "grad_norm": 1.6827846765518188, - "kl": 6.3671875, - "learning_rate": 6.533423765106509e-07, - "loss": 0.3959, - "num_tokens": 799687372.0, - "reward": 1.81005859375, - "reward_std": 0.5226245522499084, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.1888652741909027, + "grad_norm": 2.661656379699707, + "kl": 2.29296875, + "learning_rate": 6.536418411418176e-07, + "loss": 0.1255, + "num_tokens": 866524482.0, + "reward": 1.02587890625, + "reward_std": 0.2767181992530823, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.91064453125, + "rewards/tag_count_reward/std": 0.19894284009933472, "step": 1417 }, { @@ -41108,27 +41108,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 671.201171875, - "completions/mean_terminated_length": 654.8755493164062, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_terminated_length": 1613.0, + "completions/mean_length": 680.328125, + "completions/mean_terminated_length": 677.6516723632812, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, "epoch": 0.48408295638815396, - "grad_norm": 2.371227264404297, - "kl": 6.671875, - "learning_rate": 6.528205237075916e-07, - "loss": 0.382, - "num_tokens": 800112291.0, - "reward": 1.85986328125, - "reward_std": 0.532575249671936, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.17183120548725128, + "grad_norm": 3.6637697219848633, + "kl": 2.44921875, + "learning_rate": 6.531198744690596e-07, + "loss": 0.1373, + "num_tokens": 866954074.0, + "reward": 1.0380859375, + "reward_std": 0.2919238805770874, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.17795921862125397, "step": 1418 }, { @@ -41137,27 +41137,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1898.0, - "completions/mean_length": 730.56640625, - "completions/mean_terminated_length": 696.2445068359375, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1556.0, + "completions/max_terminated_length": 1556.0, + "completions/mean_length": 702.30078125, + "completions/mean_terminated_length": 702.30078125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.4844243407015448, - "grad_norm": 3.0626814365386963, - "kl": 6.453125, - "learning_rate": 6.522985249696049e-07, - "loss": 0.3758, - "num_tokens": 800560037.0, - "reward": 1.892578125, - "reward_std": 0.5158536434173584, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.1876731961965561, + "grad_norm": 2.332885265350342, + "kl": 2.56640625, + "learning_rate": 6.525977613254326e-07, + "loss": 0.1373, + "num_tokens": 867387348.0, + "reward": 1.0224609375, + "reward_std": 0.26636356115341187, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.1879250556230545, "step": 1419 }, { @@ -41166,27 +41166,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 754.92578125, - "completions/mean_terminated_length": 710.5172119140625, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1872.0, + "completions/max_terminated_length": 1872.0, + "completions/mean_length": 643.779296875, + "completions/mean_terminated_length": 643.779296875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.4847657250149356, - "grad_norm": 3.68796706199646, - "kl": 8.90625, - "learning_rate": 6.517763810375727e-07, - "loss": 0.5475, - "num_tokens": 801022831.0, - "reward": 1.84814453125, - "reward_std": 0.5662318468093872, - "rewards/accuracy_reward/mean": 0.07661290466785431, - "rewards/accuracy_reward/std": 0.2662447690963745, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.1911684274673462, + "grad_norm": 2.5800259113311768, + "kl": 3.81640625, + "learning_rate": 6.520755024525431e-07, + "loss": 0.2089, + "num_tokens": 867793235.0, + "reward": 1.0185546875, + "reward_std": 0.27083826065063477, + "rewards/accuracy_reward/mean": 0.10080645233392715, + "rewards/accuracy_reward/std": 0.30137622356414795, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9169921875, + "rewards/tag_count_reward/std": 0.1914113163948059, "step": 1420 }, { @@ -41195,27 +41195,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1918.0, - "completions/mean_length": 709.62109375, - "completions/mean_terminated_length": 677.5000610351562, - "completions/min_length": 72.0, - "completions/min_terminated_length": 72.0, + "completions/max_terminated_length": 1669.0, + "completions/mean_length": 649.162109375, + "completions/mean_terminated_length": 646.4246826171875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, "epoch": 0.48510710932832635, - "grad_norm": 1.6579664945602417, - "kl": 6.4453125, - "learning_rate": 6.512540926525828e-07, - "loss": 0.3901, - "num_tokens": 801452461.0, - "reward": 1.92333984375, - "reward_std": 0.5349385142326355, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.93701171875, - "rewards/tag_count_reward/std": 0.18389739096164703, + "grad_norm": 5.582352161407471, + "kl": 4.46484375, + "learning_rate": 6.515530985922047e-07, + "loss": 0.225, + "num_tokens": 868191910.0, + "reward": 1.02880859375, + "reward_std": 0.29447507858276367, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.90771484375, + "rewards/tag_count_reward/std": 0.20726542174816132, "step": 1421 }, { @@ -41224,27 +41224,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1980.0, - "completions/mean_length": 756.48828125, - "completions/mean_terminated_length": 720.1807250976562, - "completions/min_length": 233.0, - "completions/min_terminated_length": 233.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1666.0, + "completions/max_terminated_length": 1666.0, + "completions/mean_length": 672.677734375, + "completions/mean_terminated_length": 672.677734375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, "epoch": 0.48544849364171716, - "grad_norm": 1.7433476448059082, - "kl": 6.7890625, - "learning_rate": 6.507316605559281e-07, - "loss": 0.4027, - "num_tokens": 801916567.0, - "reward": 1.82568359375, - "reward_std": 0.5088048577308655, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.18511444330215454, + "grad_norm": 2.7323851585388184, + "kl": 3.421875, + "learning_rate": 6.510305504864369e-07, + "loss": 0.1899, + "num_tokens": 868613105.0, + "reward": 0.95751953125, + "reward_std": 0.2595144808292389, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90869140625, + "rewards/tag_count_reward/std": 0.1949400156736374, "step": 1422 }, { @@ -41253,27 +41253,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 739.330078125, - "completions/mean_terminated_length": 715.9144897460938, - "completions/min_length": 192.0, - "completions/min_terminated_length": 192.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1384.0, + "completions/max_terminated_length": 1384.0, + "completions/mean_length": 639.861328125, + "completions/mean_terminated_length": 639.861328125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.485789877955108, - "grad_norm": 3.603640079498291, - "kl": 4.7109375, - "learning_rate": 6.502090854891051e-07, - "loss": 0.335, - "num_tokens": 802367072.0, - "reward": 1.931640625, - "reward_std": 0.48461073637008667, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.953125, - "rewards/tag_count_reward/std": 0.153242826461792, + "grad_norm": 5.383285045623779, + "kl": 4.140625, + "learning_rate": 6.505078588774637e-07, + "loss": 0.2095, + "num_tokens": 869012682.0, + "reward": 0.99560546875, + "reward_std": 0.2752225995063782, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.91357421875, + "rewards/tag_count_reward/std": 0.19466042518615723, "step": 1423 }, { @@ -41282,27 +41282,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 789.9375, - "completions/mean_terminated_length": 754.5702514648438, - "completions/min_length": 79.0, - "completions/min_terminated_length": 79.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1605.0, + "completions/max_terminated_length": 1605.0, + "completions/mean_length": 685.84375, + "completions/mean_terminated_length": 685.84375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, "epoch": 0.4861312622684988, - "grad_norm": 1.4389578104019165, - "kl": 6.1171875, - "learning_rate": 6.496863681938138e-07, - "loss": 0.3802, - "num_tokens": 802846896.0, - "reward": 1.7919921875, - "reward_std": 0.535548985004425, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.19179034233093262, + "grad_norm": 7.578466415405273, + "kl": 3.921875, + "learning_rate": 6.499850245077135e-07, + "loss": 0.1856, + "num_tokens": 869439210.0, + "reward": 1.01025390625, + "reward_std": 0.2748653292655945, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.90869140625, + "rewards/tag_count_reward/std": 0.19368110597133636, "step": 1424 }, { @@ -41311,27 +41311,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1757.0, - "completions/mean_length": 773.16015625, - "completions/mean_terminated_length": 734.68408203125, - "completions/min_length": 124.0, - "completions/min_terminated_length": 124.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1471.0, + "completions/max_terminated_length": 1471.0, + "completions/mean_length": 642.060546875, + "completions/mean_terminated_length": 642.060546875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, "epoch": 0.48647264658188955, - "grad_norm": 1.6272501945495605, - "kl": 5.1328125, - "learning_rate": 6.491635094119558e-07, - "loss": 0.3192, - "num_tokens": 803316226.0, - "reward": 1.904296875, - "reward_std": 0.5398890972137451, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310528099536896, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.931640625, - "rewards/tag_count_reward/std": 0.1823011338710785, + "grad_norm": 5.208425045013428, + "kl": 3.5859375, + "learning_rate": 6.494620481198169e-07, + "loss": 0.2107, + "num_tokens": 869841417.0, + "reward": 1.02099609375, + "reward_std": 0.28070178627967834, + "rewards/accuracy_reward/mean": 0.11895161122083664, + "rewards/accuracy_reward/std": 0.3240584135055542, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.90380859375, + "rewards/tag_count_reward/std": 0.20488207042217255, "step": 1425 }, { @@ -41340,27 +41340,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 775.666015625, - "completions/mean_terminated_length": 737.2655639648438, - "completions/min_length": 167.0, - "completions/min_terminated_length": 167.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1535.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 642.255859375, + "completions/mean_terminated_length": 642.255859375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, "epoch": 0.48681403089528036, - "grad_norm": 1.0159920454025269, - "kl": 6.25, - "learning_rate": 6.486405098856333e-07, - "loss": 0.3864, - "num_tokens": 803787991.0, - "reward": 1.79345703125, - "reward_std": 0.531171441078186, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.19348366558551788, + "grad_norm": 2.937472343444824, + "kl": 3.33984375, + "learning_rate": 6.489389304566067e-07, + "loss": 0.1919, + "num_tokens": 870244876.0, + "reward": 0.9892578125, + "reward_std": 0.24509309232234955, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.17922191321849823, "step": 1426 }, { @@ -41369,27 +41369,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1935.0, - "completions/mean_length": 740.220703125, - "completions/mean_terminated_length": 706.1503295898438, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1601.0, + "completions/max_terminated_length": 1601.0, + "completions/mean_length": 616.078125, + "completions/mean_terminated_length": 616.078125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, "epoch": 0.48715541520867117, - "grad_norm": 0.8419586420059204, - "kl": 6.4765625, - "learning_rate": 6.481173703571487e-07, - "loss": 0.4009, - "num_tokens": 804246792.0, - "reward": 1.8125, - "reward_std": 0.5184415578842163, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.17828376591205597, + "grad_norm": 4.275766849517822, + "kl": 3.0078125, + "learning_rate": 6.484156722611161e-07, + "loss": 0.213, + "num_tokens": 870640116.0, + "reward": 0.9794921875, + "reward_std": 0.22628310322761536, + "rewards/accuracy_reward/mean": 0.044921875, + "rewards/accuracy_reward/std": 0.20733514428138733, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.9326171875, + "rewards/tag_count_reward/std": 0.1772274672985077, "step": 1427 }, { @@ -41398,27 +41398,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 835.873046875, - "completions/mean_terminated_length": 789.158203125, - "completions/min_length": 72.0, - "completions/min_terminated_length": 72.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1525.0, + "completions/max_terminated_length": 1525.0, + "completions/mean_length": 660.748046875, + "completions/mean_terminated_length": 660.748046875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, "epoch": 0.487496799522062, - "grad_norm": 2.0123610496520996, - "kl": 6.5625, - "learning_rate": 6.475940915690028e-07, - "loss": 0.3536, - "num_tokens": 804751847.0, - "reward": 1.79638671875, - "reward_std": 0.5437809824943542, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.8125, - "rewards/format_reward/std": 0.39069411158561707, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.1927117109298706, + "grad_norm": 3.146787643432617, + "kl": 2.54296875, + "learning_rate": 6.478922742765782e-07, + "loss": 0.1472, + "num_tokens": 871055507.0, + "reward": 0.97998046875, + "reward_std": 0.2378777414560318, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92529296875, + "rewards/tag_count_reward/std": 0.1861388236284256, "step": 1428 }, { @@ -41427,27 +41427,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1971.0, - "completions/mean_length": 807.216796875, - "completions/mean_terminated_length": 764.6040649414062, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1694.0, + "completions/max_terminated_length": 1694.0, + "completions/mean_length": 635.7890625, + "completions/mean_terminated_length": 635.7890625, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, "epoch": 0.48783818383545274, - "grad_norm": 3.6934638023376465, - "kl": 7.0625, - "learning_rate": 6.470706742638942e-07, - "loss": 0.4154, - "num_tokens": 805246662.0, - "reward": 1.78173828125, - "reward_std": 0.5377598404884338, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.19903406500816345, + "grad_norm": 2.971524953842163, + "kl": 2.419921875, + "learning_rate": 6.473687372464243e-07, + "loss": 0.1315, + "num_tokens": 871462551.0, + "reward": 1.01953125, + "reward_std": 0.24275973439216614, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.1719430834054947, "step": 1429 }, { @@ -41456,27 +41456,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 843.90625, - "completions/mean_terminated_length": 784.6884765625, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1387.0, + "completions/max_terminated_length": 1387.0, + "completions/mean_length": 659.9140625, + "completions/mean_terminated_length": 659.9140625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.48817956814884356, - "grad_norm": 3.9663517475128174, - "kl": 8.9609375, - "learning_rate": 6.465471191847177e-07, - "loss": 0.5069, - "num_tokens": 805758886.0, - "reward": 1.7236328125, - "reward_std": 0.6423845291137695, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.765625, - "rewards/format_reward/std": 0.42402184009552, - "rewards/tag_count_reward/mean": 0.8935546875, - "rewards/tag_count_reward/std": 0.22588695585727692, + "grad_norm": 6.472031116485596, + "kl": 2.0859375, + "learning_rate": 6.468450619142831e-07, + "loss": 0.115, + "num_tokens": 871880571.0, + "reward": 1.01416015625, + "reward_std": 0.26881539821624756, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.1780041754245758, "step": 1430 }, { @@ -41485,27 +41485,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 840.4375, - "completions/mean_terminated_length": 796.437255859375, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1564.0, + "completions/max_terminated_length": 1564.0, + "completions/mean_length": 648.427734375, + "completions/mean_terminated_length": 648.427734375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, "epoch": 0.48852095246223437, - "grad_norm": 0.9592890739440918, - "kl": 6.546875, - "learning_rate": 6.460234270745645e-07, - "loss": 0.3858, - "num_tokens": 806264070.0, - "reward": 1.84912109375, - "reward_std": 0.5959770679473877, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.19561529159545898, + "grad_norm": 2.4579715728759766, + "kl": 2.2109375, + "learning_rate": 6.463212490239804e-07, + "loss": 0.1139, + "num_tokens": 872287446.0, + "reward": 1.08740234375, + "reward_std": 0.2898830771446228, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.94287109375, + "rewards/tag_count_reward/std": 0.16781944036483765, "step": 1431 }, { @@ -41514,27 +41514,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1957.0, - "completions/mean_length": 857.744140625, - "completions/mean_terminated_length": 796.6427612304688, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/max_terminated_length": 1378.0, + "completions/mean_length": 657.15625, + "completions/mean_terminated_length": 651.7020263671875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.4888623367756252, - "grad_norm": 1.1903507709503174, - "kl": 7.109375, - "learning_rate": 6.454995986767193e-07, - "loss": 0.3926, - "num_tokens": 806788211.0, - "reward": 1.70947265625, - "reward_std": 0.5695422887802124, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.78125, - "rewards/format_reward/std": 0.41380295157432556, - "rewards/tag_count_reward/mean": 0.89892578125, - "rewards/tag_count_reward/std": 0.22319906949996948, + "grad_norm": 6.283302307128906, + "kl": 2.71484375, + "learning_rate": 6.457972993195369e-07, + "loss": 0.1931, + "num_tokens": 872708886.0, + "reward": 0.986328125, + "reward_std": 0.22939418256282806, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.19066354632377625, "step": 1432 }, { @@ -41543,27 +41543,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1953.0, - "completions/mean_length": 853.853515625, - "completions/mean_terminated_length": 802.7800903320312, - "completions/min_length": 199.0, - "completions/min_terminated_length": 199.0, + "completions/max_terminated_length": 1702.0, + "completions/mean_length": 634.87109375, + "completions/mean_terminated_length": 629.3294677734375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.48920372108901594, - "grad_norm": 1.5274327993392944, - "kl": 5.78125, - "learning_rate": 6.44975634734661e-07, - "loss": 0.3492, - "num_tokens": 807299704.0, - "reward": 1.85107421875, - "reward_std": 0.6022671461105347, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.19277119636535645, + "grad_norm": 2.11894154548645, + "kl": 2.52734375, + "learning_rate": 6.452732135451674e-07, + "loss": 0.1314, + "num_tokens": 873108260.0, + "reward": 1.07177734375, + "reward_std": 0.2627972960472107, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.16120965778827667, "step": 1433 }, { @@ -41572,27 +41572,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 851.294921875, - "completions/mean_terminated_length": 812.6915283203125, - "completions/min_length": 89.0, - "completions/min_terminated_length": 89.0, + "completions/max_terminated_length": 1546.0, + "completions/mean_length": 662.576171875, + "completions/mean_terminated_length": 659.864990234375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.48954510540240675, - "grad_norm": 1.284857153892517, - "kl": 5.73046875, - "learning_rate": 6.444515359920605e-07, - "loss": 0.3554, - "num_tokens": 807812031.0, - "reward": 1.76708984375, - "reward_std": 0.5164639949798584, - "rewards/accuracy_reward/mean": 0.038306452333927155, - "rewards/accuracy_reward/std": 0.19212883710861206, - "rewards/format_reward/mean": 0.8125, - "rewards/format_reward/std": 0.39069411158561707, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.19510170817375183, + "grad_norm": 2.6592986583709717, + "kl": 3.5390625, + "learning_rate": 6.447489924452806e-07, + "loss": 0.1967, + "num_tokens": 873523963.0, + "reward": 0.99462890625, + "reward_std": 0.24731838703155518, + "rewards/accuracy_reward/mean": 0.06854838877916336, + "rewards/accuracy_reward/std": 0.25293970108032227, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.18640044331550598, "step": 1434 }, { @@ -41601,27 +41601,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 874.740234375, - "completions/mean_terminated_length": 824.5601196289062, - "completions/min_length": 125.0, - "completions/min_terminated_length": 125.0, + "completions/max_terminated_length": 1362.0, + "completions/mean_length": 656.595703125, + "completions/mean_terminated_length": 648.3948974609375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, "epoch": 0.48988648971579757, - "grad_norm": 1.281517505645752, - "kl": 6.0, - "learning_rate": 6.439273031927801e-07, - "loss": 0.3561, - "num_tokens": 808335802.0, - "reward": 1.80078125, - "reward_std": 0.5718402862548828, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.8046875, - "rewards/format_reward/std": 0.3968288004398346, - "rewards/tag_count_reward/mean": 0.912109375, - "rewards/tag_count_reward/std": 0.20473802089691162, + "grad_norm": 3.1901962757110596, + "kl": 3.4765625, + "learning_rate": 6.442246367644769e-07, + "loss": 0.1879, + "num_tokens": 873936044.0, + "reward": 1.021484375, + "reward_std": 0.2841264605522156, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.189265176653862, "step": 1435 }, { @@ -41630,27 +41630,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 910.60546875, - "completions/mean_terminated_length": 861.9592895507812, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 667.998046875, + "completions/mean_terminated_length": 662.5863037109375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, "epoch": 0.4902278740291884, - "grad_norm": 1.5556361675262451, - "kl": 5.3046875, - "learning_rate": 6.434029370808722e-07, - "loss": 0.3071, - "num_tokens": 808898704.0, - "reward": 1.74560546875, - "reward_std": 0.6147333383560181, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.79296875, - "rewards/format_reward/std": 0.40557438135147095, - "rewards/tag_count_reward/mean": 0.89794921875, - "rewards/tag_count_reward/std": 0.22220362722873688, + "grad_norm": 4.424574851989746, + "kl": 3.609375, + "learning_rate": 6.437001472475482e-07, + "loss": 0.185, + "num_tokens": 874374731.0, + "reward": 1.0400390625, + "reward_std": 0.2838236093521118, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.9326171875, + "rewards/tag_count_reward/std": 0.1772274672985077, "step": 1436 }, { @@ -41659,27 +41659,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.068359375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 883.8984375, - "completions/mean_terminated_length": 798.482177734375, - "completions/min_length": 190.0, - "completions/min_terminated_length": 190.0, + "completions/max_terminated_length": 1392.0, + "completions/mean_length": 685.525390625, + "completions/mean_terminated_length": 682.8590698242188, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, "epoch": 0.49056925834257914, - "grad_norm": 2.6624701023101807, - "kl": 7.65625, - "learning_rate": 6.428784384005789e-07, - "loss": 0.4915, - "num_tokens": 809430764.0, - "reward": 1.7841796875, - "reward_std": 0.6037580966949463, - "rewards/accuracy_reward/mean": 0.10282257944345474, - "rewards/accuracy_reward/std": 0.30403366684913635, - "rewards/format_reward/mean": 0.79296875, - "rewards/format_reward/std": 0.40557438135147095, - "rewards/tag_count_reward/mean": 0.8916015625, - "rewards/tag_count_reward/std": 0.23191487789154053, + "grad_norm": 3.477193593978882, + "kl": 2.783203125, + "learning_rate": 6.431755246394763e-07, + "loss": 0.1606, + "num_tokens": 874805224.0, + "reward": 1.04443359375, + "reward_std": 0.23347032070159912, + "rewards/accuracy_reward/mean": 0.10685484111309052, + "rewards/accuracy_reward/std": 0.30924052000045776, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.1671748012304306, "step": 1437 }, { @@ -41688,27 +41688,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0859375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 1002.501953125, - "completions/mean_terminated_length": 904.2073364257812, - "completions/min_length": 223.0, - "completions/min_terminated_length": 223.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 740.4921875, + "completions/mean_terminated_length": 737.9334716796875, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, "epoch": 0.49091064265596995, - "grad_norm": 1.8498705625534058, - "kl": 7.9140625, - "learning_rate": 6.423538078963299e-07, - "loss": 0.4687, - "num_tokens": 810020845.0, - "reward": 1.6845703125, - "reward_std": 0.6515914797782898, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.751953125, - "rewards/format_reward/std": 0.4323015511035919, - "rewards/tag_count_reward/mean": 0.8740234375, - "rewards/tag_count_reward/std": 0.2402685582637787, + "grad_norm": 3.894792079925537, + "kl": 3.140625, + "learning_rate": 6.426507696854321e-07, + "loss": 0.1533, + "num_tokens": 875261156.0, + "reward": 1.0, + "reward_std": 0.2633817195892334, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.19436629116535187, "step": 1438 }, { @@ -41717,27 +41717,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1969.0, - "completions/mean_length": 883.509765625, - "completions/mean_terminated_length": 805.8771362304688, - "completions/min_length": 193.0, - "completions/min_terminated_length": 193.0, + "completions/max_terminated_length": 1813.0, + "completions/mean_length": 690.013671875, + "completions/mean_terminated_length": 687.3561401367188, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.49125202696936077, - "grad_norm": 1.4366450309753418, - "kl": 6.3203125, - "learning_rate": 6.418290463127423e-07, - "loss": 0.4229, - "num_tokens": 810556290.0, - "reward": 1.8056640625, - "reward_std": 0.617745041847229, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, - "rewards/format_reward/mean": 0.79296875, - "rewards/format_reward/std": 0.40557438135147095, - "rewards/tag_count_reward/mean": 0.8974609375, - "rewards/tag_count_reward/std": 0.21948480606079102, + "grad_norm": 4.6422834396362305, + "kl": 2.69140625, + "learning_rate": 6.421258831307744e-07, + "loss": 0.122, + "num_tokens": 875697531.0, + "reward": 1.09765625, + "reward_std": 0.30944955348968506, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.1748199313879013, "step": 1439 }, { @@ -41746,27 +41746,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 876.6015625, - "completions/mean_terminated_length": 821.5050659179688, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1704.0, + "completions/max_terminated_length": 1704.0, + "completions/mean_length": 689.810546875, + "completions/mean_terminated_length": 689.810546875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, "epoch": 0.4915934112827516, - "grad_norm": 1.4568078517913818, - "kl": 4.9609375, - "learning_rate": 6.413041543946192e-07, - "loss": 0.3174, - "num_tokens": 811087878.0, - "reward": 1.74755859375, - "reward_std": 0.5549442172050476, - "rewards/accuracy_reward/mean": 0.03427419438958168, - "rewards/accuracy_reward/std": 0.18211629986763, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.90771484375, - "rewards/tag_count_reward/std": 0.2048913985490799, + "grad_norm": 4.3183698654174805, + "kl": 2.255859375, + "learning_rate": 6.416008657210492e-07, + "loss": 0.1091, + "num_tokens": 876133482.0, + "reward": 0.99755859375, + "reward_std": 0.2176560014486313, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24230584502220154, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.17291219532489777, "step": 1440 }, { @@ -41775,27 +41775,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 892.8046875, - "completions/mean_terminated_length": 831.0040893554688, - "completions/min_length": 59.0, - "completions/min_terminated_length": 59.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1722.0, + "completions/max_terminated_length": 1722.0, + "completions/mean_length": 663.212890625, + "completions/mean_terminated_length": 663.212890625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, "epoch": 0.49193479559614234, - "grad_norm": 2.4246034622192383, - "kl": 6.1875, - "learning_rate": 6.407791328869488e-07, - "loss": 0.3727, - "num_tokens": 811626754.0, - "reward": 1.68408203125, - "reward_std": 0.637832760810852, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.736328125, - "rewards/format_reward/std": 0.4410543739795685, - "rewards/tag_count_reward/mean": 0.88525390625, - "rewards/tag_count_reward/std": 0.21928171813488007, + "grad_norm": 2.023632287979126, + "kl": 2.125, + "learning_rate": 6.41075718201988e-07, + "loss": 0.113, + "num_tokens": 876554807.0, + "reward": 1.03955078125, + "reward_std": 0.2606458067893982, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.16826865077018738, "step": 1441 }, { @@ -41804,27 +41804,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 860.3671875, - "completions/mean_terminated_length": 817.0931396484375, - "completions/min_length": 115.0, - "completions/min_terminated_length": 115.0, + "completions/max_terminated_length": 1888.0, + "completions/mean_length": 682.365234375, + "completions/mean_terminated_length": 677.0098266601562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, "epoch": 0.49227617990953315, - "grad_norm": 1.6708537340164185, - "kl": 5.96484375, - "learning_rate": 6.402539825349032e-07, - "loss": 0.3627, - "num_tokens": 812145854.0, - "reward": 1.7451171875, - "reward_std": 0.6325790286064148, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.767578125, - "rewards/format_reward/std": 0.42278963327407837, - "rewards/tag_count_reward/mean": 0.8955078125, - "rewards/tag_count_reward/std": 0.21346396207809448, + "grad_norm": 2.9873735904693604, + "kl": 3.34375, + "learning_rate": 6.405504413195072e-07, + "loss": 0.1475, + "num_tokens": 876982770.0, + "reward": 1.06005859375, + "reward_std": 0.2674265503883362, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.17769792675971985, "step": 1442 }, { @@ -41833,27 +41833,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 859.701171875, - "completions/mean_terminated_length": 808.8778686523438, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1546.0, + "completions/max_terminated_length": 1546.0, + "completions/mean_length": 712.302734375, + "completions/mean_terminated_length": 712.302734375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, "epoch": 0.49261756422292396, - "grad_norm": 1.6362295150756836, - "kl": 5.0234375, - "learning_rate": 6.397287040838367e-07, - "loss": 0.2981, - "num_tokens": 812658821.0, - "reward": 1.8720703125, - "reward_std": 0.6203747987747192, - "rewards/accuracy_reward/mean": 0.142578125, - "rewards/accuracy_reward/std": 0.3499840497970581, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, - "rewards/tag_count_reward/mean": 0.9072265625, - "rewards/tag_count_reward/std": 0.20074841380119324, + "grad_norm": 1.8084334135055542, + "kl": 3.16015625, + "learning_rate": 6.400250358197072e-07, + "loss": 0.1407, + "num_tokens": 877420269.0, + "reward": 1.095703125, + "reward_std": 0.3144042491912842, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.919921875, + "rewards/tag_count_reward/std": 0.1862010508775711, "step": 1443 }, { @@ -41862,27 +41862,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 860.16796875, - "completions/mean_terminated_length": 799.1909790039062, - "completions/min_length": 193.0, - "completions/min_terminated_length": 193.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1450.0, + "completions/max_terminated_length": 1450.0, + "completions/mean_length": 665.076171875, + "completions/mean_terminated_length": 665.076171875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.4929589485363148, - "grad_norm": 2.229505777359009, - "kl": 5.890625, - "learning_rate": 6.392032982792865e-07, - "loss": 0.3685, - "num_tokens": 813171643.0, - "reward": 1.728515625, - "reward_std": 0.5821982622146606, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.77734375, - "rewards/format_reward/std": 0.41643625497817993, - "rewards/tag_count_reward/mean": 0.8984375, - "rewards/tag_count_reward/std": 0.20497123897075653, + "grad_norm": 1.6828752756118774, + "kl": 2.341796875, + "learning_rate": 6.39499502448871e-07, + "loss": 0.0777, + "num_tokens": 877833204.0, + "reward": 1.0166015625, + "reward_std": 0.24552182853221893, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.9404296875, + "rewards/tag_count_reward/std": 0.16438288986682892, "step": 1444 }, { @@ -41891,27 +41891,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 884.6328125, - "completions/mean_terminated_length": 807.0750122070312, - "completions/min_length": 69.0, - "completions/min_terminated_length": 69.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1804.0, + "completions/max_terminated_length": 1804.0, + "completions/mean_length": 706.244140625, + "completions/mean_terminated_length": 706.244140625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.49330033284970554, - "grad_norm": 1.5415078401565552, - "kl": 6.046875, - "learning_rate": 6.386777658669698e-07, - "loss": 0.3889, - "num_tokens": 813709199.0, - "reward": 1.7314453125, - "reward_std": 0.584844172000885, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.77734375, - "rewards/format_reward/std": 0.41643625497817993, - "rewards/tag_count_reward/mean": 0.8974609375, - "rewards/tag_count_reward/std": 0.2166806012392044, + "grad_norm": 1.7896771430969238, + "kl": 2.09765625, + "learning_rate": 6.389738419534628e-07, + "loss": 0.1076, + "num_tokens": 878279425.0, + "reward": 1.05224609375, + "reward_std": 0.22195741534233093, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.95068359375, + "rewards/tag_count_reward/std": 0.15367169678211212, "step": 1445 }, { @@ -41920,27 +41920,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 882.5390625, - "completions/mean_terminated_length": 837.6226806640625, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/max_terminated_length": 1694.0, + "completions/mean_length": 719.84765625, + "completions/mean_terminated_length": 717.24853515625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, "epoch": 0.49364171716309635, - "grad_norm": 1.8591444492340088, - "kl": 4.578125, - "learning_rate": 6.38152107592784e-07, - "loss": 0.3006, - "num_tokens": 814235235.0, - "reward": 1.87451171875, - "reward_std": 0.5980396270751953, - "rewards/accuracy_reward/mean": 0.14516128599643707, - "rewards/accuracy_reward/std": 0.3526190221309662, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.19738534092903137, + "grad_norm": 2.7875778675079346, + "kl": 2.61328125, + "learning_rate": 6.38448055080128e-07, + "loss": 0.1502, + "num_tokens": 878722163.0, + "reward": 1.0810546875, + "reward_std": 0.2939624786376953, + "rewards/accuracy_reward/mean": 0.1391129046678543, + "rewards/accuracy_reward/std": 0.34641367197036743, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.17451083660125732, "step": 1446 }, { @@ -41949,27 +41949,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1973.0, - "completions/mean_length": 918.84375, - "completions/mean_terminated_length": 855.9835205078125, - "completions/min_length": 229.0, - "completions/min_terminated_length": 229.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1718.0, + "completions/max_terminated_length": 1718.0, + "completions/mean_length": 727.11328125, + "completions/mean_terminated_length": 727.11328125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, "epoch": 0.49398310147648716, - "grad_norm": 1.9435086250305176, - "kl": 5.421875, - "learning_rate": 6.376263242028048e-07, - "loss": 0.3275, - "num_tokens": 814781123.0, - "reward": 1.7568359375, - "reward_std": 0.5942944288253784, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.9052734375, - "rewards/tag_count_reward/std": 0.20881156623363495, + "grad_norm": 4.331711292266846, + "kl": 3.35546875, + "learning_rate": 6.379221425756913e-07, + "loss": 0.1257, + "num_tokens": 879169885.0, + "reward": 0.99951171875, + "reward_std": 0.2779485583305359, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.91357421875, + "rewards/tag_count_reward/std": 0.19528773427009583, "step": 1447 }, { @@ -41978,27 +41978,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1965.0, - "completions/mean_length": 814.65234375, - "completions/mean_terminated_length": 753.995849609375, - "completions/min_length": 2.0, - "completions/min_terminated_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1764.0, + "completions/max_terminated_length": 1764.0, + "completions/mean_length": 635.263671875, + "completions/mean_terminated_length": 635.263671875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, "epoch": 0.494324485789878, - "grad_norm": 1.8552683591842651, - "kl": 5.2578125, - "learning_rate": 6.371004164432853e-07, - "loss": 0.3284, - "num_tokens": 815281457.0, - "reward": 1.80126953125, - "reward_std": 0.5525183081626892, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.8125, - "rewards/format_reward/std": 0.39069411158561707, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.20247536897659302, + "grad_norm": 2.922173261642456, + "kl": 2.845703125, + "learning_rate": 6.373961051871552e-07, + "loss": 0.1321, + "num_tokens": 879578372.0, + "reward": 1.03759765625, + "reward_std": 0.20199885964393616, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.95166015625, + "rewards/tag_count_reward/std": 0.14328470826148987, "step": 1448 }, { @@ -42007,27 +42007,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 859.658203125, - "completions/mean_terminated_length": 818.8464965820312, - "completions/min_length": 2.0, - "completions/min_terminated_length": 2.0, + "completions/max_terminated_length": 1546.0, + "completions/mean_length": 694.490234375, + "completions/mean_terminated_length": 689.182373046875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.49466587010326873, - "grad_norm": 1.342028260231018, - "kl": 5.1875, - "learning_rate": 6.365743850606555e-07, - "loss": 0.3233, - "num_tokens": 815793154.0, - "reward": 1.80224609375, - "reward_std": 0.5543564558029175, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.19882753491401672, + "grad_norm": 3.038449287414551, + "kl": 3.423828125, + "learning_rate": 6.368699436617006e-07, + "loss": 0.1548, + "num_tokens": 880005503.0, + "reward": 1.0234375, + "reward_std": 0.24409040808677673, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.1698969453573227, "step": 1449 }, { @@ -42036,27 +42036,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 841.158203125, - "completions/mean_terminated_length": 786.9734497070312, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1937.0, + "completions/max_terminated_length": 1937.0, + "completions/mean_length": 670.849609375, + "completions/mean_terminated_length": 670.849609375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.49500725441665955, - "grad_norm": 1.3957794904708862, - "kl": 6.26953125, - "learning_rate": 6.360482308015209e-07, - "loss": 0.3895, - "num_tokens": 816302019.0, - "reward": 1.744140625, - "reward_std": 0.6250890493392944, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.765625, - "rewards/format_reward/std": 0.42402184009552, - "rewards/tag_count_reward/mean": 0.892578125, - "rewards/tag_count_reward/std": 0.21543563902378082, + "grad_norm": 3.174421787261963, + "kl": 2.32421875, + "learning_rate": 6.363436587466842e-07, + "loss": 0.1106, + "num_tokens": 880427170.0, + "reward": 1.00732421875, + "reward_std": 0.21429546177387238, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.1671748012304306, "step": 1450 }, { @@ -42065,27 +42065,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 813.330078125, - "completions/mean_terminated_length": 755.2576293945312, - "completions/min_length": 8.0, - "completions/min_terminated_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1443.0, + "completions/max_terminated_length": 1443.0, + "completions/mean_length": 635.716796875, + "completions/mean_terminated_length": 635.716796875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.49534863873005036, - "grad_norm": 2.0098752975463867, - "kl": 5.5234375, - "learning_rate": 6.35521954412661e-07, - "loss": 0.3354, - "num_tokens": 816805356.0, - "reward": 1.806640625, - "reward_std": 0.5330133438110352, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.1976810246706009, + "grad_norm": 1.7578914165496826, + "kl": 2.0078125, + "learning_rate": 6.358172511896381e-07, + "loss": 0.0854, + "num_tokens": 880839569.0, + "reward": 1.04052734375, + "reward_std": 0.20281007885932922, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.95458984375, + "rewards/tag_count_reward/std": 0.14167514443397522, "step": 1451 }, { @@ -42094,27 +42094,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1937.0, - "completions/mean_length": 828.271484375, - "completions/mean_terminated_length": 781.263671875, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 1712.0, + "completions/mean_length": 694.564453125, + "completions/mean_terminated_length": 689.2568969726562, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.4956900230434412, - "grad_norm": 1.8008296489715576, - "kl": 5.1953125, - "learning_rate": 6.34995556641029e-07, - "loss": 0.3325, - "num_tokens": 817309447.0, - "reward": 1.81640625, - "reward_std": 0.5670454502105713, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.189053013920784, + "grad_norm": 4.483496189117432, + "kl": 3.23828125, + "learning_rate": 6.352907217382684e-07, + "loss": 0.1919, + "num_tokens": 881275202.0, + "reward": 1.03271484375, + "reward_std": 0.257063627243042, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.17916527390480042, "step": 1452 }, { @@ -42123,27 +42123,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 813.849609375, - "completions/mean_terminated_length": 761.065185546875, - "completions/min_length": 100.0, - "completions/min_terminated_length": 100.0, + "completions/max_terminated_length": 1627.0, + "completions/mean_length": 680.03125, + "completions/mean_terminated_length": 677.3541870117188, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, "epoch": 0.49603140735683193, - "grad_norm": 1.881151795387268, - "kl": 5.02734375, - "learning_rate": 6.344690382337503e-07, - "loss": 0.3325, - "num_tokens": 817805530.0, - "reward": 1.83740234375, - "reward_std": 0.5921496748924255, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.20155774056911469, + "grad_norm": 6.235518455505371, + "kl": 2.076171875, + "learning_rate": 6.347640711404545e-07, + "loss": 0.1026, + "num_tokens": 881702770.0, + "reward": 1.09130859375, + "reward_std": 0.23688587546348572, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.94873046875, + "rewards/tag_count_reward/std": 0.14979879558086395, "step": 1453 }, { @@ -42152,27 +42152,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 882.619140625, - "completions/mean_terminated_length": 837.7058715820312, - "completions/min_length": 101.0, - "completions/min_terminated_length": 101.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1951.0, + "completions/max_terminated_length": 1951.0, + "completions/mean_length": 753.25390625, + "completions/mean_terminated_length": 753.25390625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.49637279167022275, - "grad_norm": 1.026684284210205, - "kl": 5.52734375, - "learning_rate": 6.339423999381216e-07, - "loss": 0.3304, - "num_tokens": 818334375.0, - "reward": 1.7353515625, - "reward_std": 0.6159314513206482, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.78515625, - "rewards/format_reward/std": 0.4111155867576599, - "rewards/tag_count_reward/mean": 0.8974609375, - "rewards/tag_count_reward/std": 0.20328371226787567, + "grad_norm": 2.465982437133789, + "kl": 3.2734375, + "learning_rate": 6.342373001442476e-07, + "loss": 0.1835, + "num_tokens": 882165380.0, + "reward": 0.99365234375, + "reward_std": 0.23353055119514465, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.92724609375, + "rewards/tag_count_reward/std": 0.18427632749080658, "step": 1454 }, { @@ -42181,27 +42181,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1980.0, - "completions/mean_length": 783.880859375, - "completions/mean_terminated_length": 750.9478759765625, - "completions/min_length": 90.0, - "completions/min_terminated_length": 90.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1541.0, + "completions/max_terminated_length": 1541.0, + "completions/mean_length": 677.916015625, + "completions/mean_terminated_length": 677.916015625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.49671417598361356, - "grad_norm": 2.7136144638061523, - "kl": 4.9375, - "learning_rate": 6.334156425016091e-07, - "loss": 0.3283, - "num_tokens": 818810810.0, - "reward": 1.84033203125, - "reward_std": 0.5457237958908081, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.1893402636051178, + "grad_norm": 3.203700065612793, + "kl": 1.634765625, + "learning_rate": 6.337104094978705e-07, + "loss": 0.085, + "num_tokens": 882587561.0, + "reward": 1.05712890625, + "reward_std": 0.19913287460803986, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.95947265625, + "rewards/tag_count_reward/std": 0.13969184458255768, "step": 1455 }, { @@ -42210,27 +42210,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 839.83984375, - "completions/mean_terminated_length": 795.8178100585938, - "completions/min_length": 190.0, - "completions/min_terminated_length": 190.0, + "completions/max_terminated_length": 1817.0, + "completions/mean_length": 740.923828125, + "completions/mean_terminated_length": 733.2200927734375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, "epoch": 0.4970555602970044, - "grad_norm": 1.8540130853652954, - "kl": 4.953125, - "learning_rate": 6.328887666718493e-07, - "loss": 0.3185, - "num_tokens": 819316904.0, - "reward": 1.8076171875, - "reward_std": 0.5355762243270874, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.18530340492725372, + "grad_norm": 2.1927859783172607, + "kl": 2.927734375, + "learning_rate": 6.331833999497157e-07, + "loss": 0.1681, + "num_tokens": 883043010.0, + "reward": 0.9970703125, + "reward_std": 0.20995213091373444, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9326171875, + "rewards/tag_count_reward/std": 0.1709035038948059, "step": 1456 }, { @@ -42239,27 +42239,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 819.150390625, - "completions/mean_terminated_length": 782.0623168945312, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1907.0, + "completions/max_terminated_length": 1907.0, + "completions/mean_length": 712.681640625, + "completions/mean_terminated_length": 712.681640625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, "epoch": 0.49739694461039513, - "grad_norm": 2.3606650829315186, - "kl": 5.5625, - "learning_rate": 6.323617731966456e-07, - "loss": 0.3095, - "num_tokens": 819824597.0, - "reward": 1.7998046875, - "reward_std": 0.542791485786438, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.18468356132507324, + "grad_norm": 4.633256912231445, + "kl": 3.5078125, + "learning_rate": 6.326562722483442e-07, + "loss": 0.2041, + "num_tokens": 883496191.0, + "reward": 1.0302734375, + "reward_std": 0.2887267470359802, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.9326171875, + "rewards/tag_count_reward/std": 0.17514485120773315, "step": 1457 }, { @@ -42268,27 +42268,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1919.0, - "completions/mean_length": 818.34375, - "completions/mean_terminated_length": 744.5134887695312, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1533.0, + "completions/max_terminated_length": 1533.0, + "completions/mean_length": 667.42578125, + "completions/mean_terminated_length": 667.42578125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, "epoch": 0.49773832892378594, - "grad_norm": 3.7965686321258545, - "kl": 6.5625, - "learning_rate": 6.318346628239691e-07, - "loss": 0.3538, - "num_tokens": 820333333.0, - "reward": 1.7490234375, - "reward_std": 0.6212472915649414, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.771484375, - "rewards/format_reward/std": 0.4202871024608612, - "rewards/tag_count_reward/mean": 0.8896484375, - "rewards/tag_count_reward/std": 0.21903157234191895, + "grad_norm": 2.6285793781280518, + "kl": 3.3359375, + "learning_rate": 6.32129027142485e-07, + "loss": 0.1633, + "num_tokens": 883927657.0, + "reward": 1.04248046875, + "reward_std": 0.28445982933044434, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.92529296875, + "rewards/tag_count_reward/std": 0.18481998145580292, "step": 1458 }, { @@ -42297,27 +42297,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 796.962890625, - "completions/mean_terminated_length": 769.4949951171875, - "completions/min_length": 99.0, - "completions/min_terminated_length": 99.0, + "completions/max_terminated_length": 1791.0, + "completions/mean_length": 679.318359375, + "completions/mean_terminated_length": 676.639892578125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.49807971323717676, - "grad_norm": 1.7308908700942993, - "kl": 5.609375, - "learning_rate": 6.313074363019565e-07, - "loss": 0.334, - "num_tokens": 820825522.0, - "reward": 1.82958984375, - "reward_std": 0.505888044834137, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.17706766724586487, + "grad_norm": 2.5234105587005615, + "kl": 3.17578125, + "learning_rate": 6.316016653810344e-07, + "loss": 0.1659, + "num_tokens": 884359612.0, + "reward": 1.0029296875, + "reward_std": 0.24092786014080048, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.18704843521118164, "step": 1459 }, { @@ -42326,27 +42326,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1984.0, - "completions/mean_length": 829.49609375, - "completions/mean_terminated_length": 787.6484985351562, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_terminated_length": 1826.0, + "completions/mean_length": 687.453125, + "completions/mean_terminated_length": 682.11767578125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, "epoch": 0.49842109755056757, - "grad_norm": 1.9581514596939087, - "kl": 4.453125, - "learning_rate": 6.307800943789093e-07, - "loss": 0.2748, - "num_tokens": 821322912.0, - "reward": 1.818359375, - "reward_std": 0.529055118560791, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.17598573863506317, + "grad_norm": 3.5279128551483154, + "kl": 3.5859375, + "learning_rate": 6.310741877130537e-07, + "loss": 0.1742, + "num_tokens": 884784276.0, + "reward": 1.02685546875, + "reward_std": 0.2783900499343872, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.1692369133234024, "step": 1460 }, { @@ -42355,27 +42355,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1868.0, - "completions/mean_length": 759.083984375, - "completions/mean_terminated_length": 728.1500244140625, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/max_terminated_length": 1242.0, + "completions/mean_length": 620.599609375, + "completions/mean_terminated_length": 617.8062744140625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, "epoch": 0.49876248186395833, - "grad_norm": 1.587014079093933, - "kl": 4.15234375, - "learning_rate": 6.302526378032931e-07, - "loss": 0.2444, - "num_tokens": 821785755.0, - "reward": 1.84375, - "reward_std": 0.5097728967666626, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.16840559244155884, + "grad_norm": 4.12233304977417, + "kl": 3.0859375, + "learning_rate": 6.305465948877691e-07, + "loss": 0.153, + "num_tokens": 885176215.0, + "reward": 1.01904296875, + "reward_std": 0.24339976906776428, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.94091796875, + "rewards/tag_count_reward/std": 0.16344067454338074, "step": 1461 }, { @@ -42384,27 +42384,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 812.93359375, - "completions/mean_terminated_length": 767.9312133789062, - "completions/min_length": 2.0, - "completions/min_terminated_length": 2.0, + "completions/max_terminated_length": 1550.0, + "completions/mean_length": 675.787109375, + "completions/mean_terminated_length": 659.5158081054688, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.49910386617734914, - "grad_norm": 1.8485121726989746, - "kl": 4.40625, - "learning_rate": 6.29725067323736e-07, - "loss": 0.2859, - "num_tokens": 822288585.0, - "reward": 1.83740234375, - "reward_std": 0.5184470415115356, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.19196152687072754, + "grad_norm": 2.21496844291687, + "kl": 4.2578125, + "learning_rate": 6.300188876545705e-07, + "loss": 0.2417, + "num_tokens": 885608826.0, + "reward": 1.0068359375, + "reward_std": 0.26521173119544983, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.1972527801990509, "step": 1462 }, { @@ -42413,27 +42413,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 799.4375, - "completions/mean_terminated_length": 764.3373413085938, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1610.0, + "completions/max_terminated_length": 1610.0, + "completions/mean_length": 658.64453125, + "completions/mean_terminated_length": 658.64453125, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, "epoch": 0.49944525049073996, - "grad_norm": 1.4282054901123047, - "kl": 4.373046875, - "learning_rate": 6.291973836890276e-07, - "loss": 0.2487, - "num_tokens": 822775737.0, - "reward": 1.8603515625, - "reward_std": 0.473154217004776, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.1731034219264984, + "grad_norm": 4.980697154998779, + "kl": 4.48828125, + "learning_rate": 6.294910667630099e-07, + "loss": 0.2304, + "num_tokens": 886023892.0, + "reward": 0.998046875, + "reward_std": 0.28366217017173767, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.1920318305492401, "step": 1463 }, { @@ -42442,27 +42442,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 760.02734375, - "completions/mean_terminated_length": 739.5833740234375, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1629.0, + "completions/max_terminated_length": 1629.0, + "completions/mean_length": 619.50390625, + "completions/mean_terminated_length": 619.50390625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, "epoch": 0.49978663480413077, - "grad_norm": 1.2178758382797241, - "kl": 4.06640625, - "learning_rate": 6.286695876481185e-07, - "loss": 0.2455, - "num_tokens": 823243271.0, - "reward": 1.90478515625, - "reward_std": 0.540515661239624, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.93798828125, - "rewards/tag_count_reward/std": 0.16826865077018738, + "grad_norm": 2.6251678466796875, + "kl": 3.046875, + "learning_rate": 6.289631329628014e-07, + "loss": 0.1529, + "num_tokens": 886419478.0, + "reward": 1.03271484375, + "reward_std": 0.2458207905292511, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.92529296875, + "rewards/tag_count_reward/std": 0.18215365707874298, "step": 1464 }, { @@ -42471,27 +42471,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 841.025390625, - "completions/mean_terminated_length": 797.0465698242188, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 685.33203125, + "completions/mean_terminated_length": 682.6653442382812, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, "epoch": 0.5001280191175216, - "grad_norm": 1.0786937475204468, - "kl": 5.9921875, - "learning_rate": 6.281416799501187e-07, - "loss": 0.3702, - "num_tokens": 823754836.0, - "reward": 1.82177734375, - "reward_std": 0.5706958770751953, - "rewards/accuracy_reward/mean": 0.06854838877916336, - "rewards/accuracy_reward/std": 0.25293973088264465, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.20408765971660614, + "grad_norm": 7.054671287536621, + "kl": 4.20703125, + "learning_rate": 6.28435087003819e-07, + "loss": 0.1922, + "num_tokens": 886851328.0, + "reward": 0.99267578125, + "reward_std": 0.28385084867477417, + "rewards/accuracy_reward/mean": 0.07056451588869095, + "rewards/accuracy_reward/std": 0.25635457038879395, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.91259765625, + "rewards/tag_count_reward/std": 0.19168755412101746, "step": 1465 }, { @@ -42500,27 +42500,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1978.0, - "completions/mean_length": 797.427734375, - "completions/mean_terminated_length": 759.68408203125, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/max_terminated_length": 1403.0, + "completions/mean_length": 671.296875, + "completions/mean_terminated_length": 668.6027221679688, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, "epoch": 0.5004694034309124, - "grad_norm": 2.623847723007202, - "kl": 5.453125, - "learning_rate": 6.276136613442964e-07, - "loss": 0.339, - "num_tokens": 824239423.0, - "reward": 1.82373046875, - "reward_std": 0.5397425889968872, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.18874886631965637, + "grad_norm": 5.145280361175537, + "kl": 2.888671875, + "learning_rate": 6.279069296360957e-07, + "loss": 0.1298, + "num_tokens": 887271336.0, + "reward": 0.986328125, + "reward_std": 0.22993148863315582, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.17459021508693695, "step": 1466 }, { @@ -42529,27 +42529,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1756.0, - "completions/mean_length": 718.177734375, - "completions/mean_terminated_length": 683.5330810546875, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/max_terminated_length": 1530.0, + "completions/mean_length": 604.04296875, + "completions/mean_terminated_length": 601.2172241210938, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, "epoch": 0.5008107877443031, - "grad_norm": 2.947995662689209, - "kl": 4.3125, - "learning_rate": 6.270855325800775e-07, - "loss": 0.2933, - "num_tokens": 824685594.0, - "reward": 1.8330078125, - "reward_std": 0.41921019554138184, - "rewards/accuracy_reward/mean": 0.026209676638245583, - "rewards/accuracy_reward/std": 0.1599196493625641, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.17166221141815186, + "grad_norm": 2.996569871902466, + "kl": 2.8515625, + "learning_rate": 6.273786616098238e-07, + "loss": 0.1923, + "num_tokens": 887659070.0, + "reward": 0.98095703125, + "reward_std": 0.22018375992774963, + "rewards/accuracy_reward/mean": 0.04838709533214569, + "rewards/accuracy_reward/std": 0.2147994488477707, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.17241966724395752, "step": 1467 }, { @@ -42558,27 +42558,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 834.5390625, - "completions/mean_terminated_length": 787.7728271484375, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1361.0, + "completions/max_terminated_length": 1361.0, + "completions/mean_length": 647.6328125, + "completions/mean_terminated_length": 647.6328125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, "epoch": 0.5011521720576939, - "grad_norm": 1.126750111579895, - "kl": 6.8203125, - "learning_rate": 6.265572944070444e-07, - "loss": 0.4545, - "num_tokens": 825185022.0, - "reward": 1.86865234375, - "reward_std": 0.5702815055847168, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, + "grad_norm": 2.6126527786254883, + "kl": 1.861328125, + "learning_rate": 6.268502836753516e-07, + "loss": 0.0651, + "num_tokens": 888062802.0, + "reward": 1.04052734375, + "reward_std": 0.2941199541091919, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.19699282944202423, + "rewards/tag_count_reward/std": 0.17876482009887695, "step": 1468 }, { @@ -42587,27 +42587,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1895.0, - "completions/mean_length": 734.4140625, - "completions/mean_terminated_length": 700.1923828125, - "completions/min_length": 52.0, - "completions/min_terminated_length": 52.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1440.0, + "completions/max_terminated_length": 1440.0, + "completions/mean_length": 614.302734375, + "completions/mean_terminated_length": 614.302734375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, "epoch": 0.5014935563710847, - "grad_norm": 1.8038650751113892, - "kl": 6.36328125, - "learning_rate": 6.260289475749344e-07, - "loss": 0.4107, - "num_tokens": 825629890.0, - "reward": 1.83984375, - "reward_std": 0.4835994243621826, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.16318395733833313, + "grad_norm": 2.461759328842163, + "kl": 2.767578125, + "learning_rate": 6.263217965831844e-07, + "loss": 0.1411, + "num_tokens": 888446173.0, + "reward": 0.9736328125, + "reward_std": 0.2209916114807129, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.18026389181613922, "step": 1469 }, { @@ -42616,27 +42616,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1883.0, - "completions/mean_length": 810.154296875, - "completions/mean_terminated_length": 751.9324951171875, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1975.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 664.33984375, + "completions/mean_terminated_length": 664.33984375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, "epoch": 0.5018349406844755, - "grad_norm": 5.53438663482666, - "kl": 9.359375, - "learning_rate": 6.255004928336391e-07, - "loss": 0.5345, - "num_tokens": 826114833.0, - "reward": 1.818359375, - "reward_std": 0.6608902812004089, - "rewards/accuracy_reward/mean": 0.12109375, - "rewards/accuracy_reward/std": 0.3265552520751953, - "rewards/format_reward/mean": 0.798828125, - "rewards/format_reward/std": 0.4012683033943176, - "rewards/tag_count_reward/mean": 0.8984375, - "rewards/tag_count_reward/std": 0.22705358266830444, + "grad_norm": 1.6656148433685303, + "kl": 2.32421875, + "learning_rate": 6.25793201083982e-07, + "loss": 0.1188, + "num_tokens": 888856459.0, + "reward": 1.07470703125, + "reward_std": 0.312046080827713, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.92822265625, + "rewards/tag_count_reward/std": 0.17996348440647125, "step": 1470 }, { @@ -42645,27 +42645,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 801.361328125, - "completions/mean_terminated_length": 721.0166625976562, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1532.0, + "completions/max_terminated_length": 1532.0, + "completions/mean_length": 620.650390625, + "completions/mean_terminated_length": 620.650390625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.5021763249978664, - "grad_norm": 8.603954315185547, - "kl": 10.2109375, - "learning_rate": 6.249719309332036e-07, - "loss": 0.5929, - "num_tokens": 826606874.0, - "reward": 1.81201171875, - "reward_std": 0.5934995412826538, - "rewards/accuracy_reward/mean": 0.12298387289047241, - "rewards/accuracy_reward/std": 0.32875028252601624, - "rewards/format_reward/mean": 0.791015625, - "rewards/format_reward/std": 0.40698084235191345, - "rewards/tag_count_reward/mean": 0.90185546875, - "rewards/tag_count_reward/std": 0.21674399077892303, + "grad_norm": 1.9301904439926147, + "kl": 2.3046875, + "learning_rate": 6.252644979285583e-07, + "loss": 0.1102, + "num_tokens": 889255976.0, + "reward": 1.0556640625, + "reward_std": 0.23601169884204865, + "rewards/accuracy_reward/mean": 0.11895161122083664, + "rewards/accuracy_reward/std": 0.3240584135055542, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.16808471083641052, "step": 1471 }, { @@ -42674,27 +42674,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 809.072265625, - "completions/mean_terminated_length": 766.5232543945312, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1526.0, + "completions/max_terminated_length": 1526.0, + "completions/mean_length": 675.171875, + "completions/mean_terminated_length": 675.171875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, "epoch": 0.5025177093112572, - "grad_norm": 6.623929977416992, - "kl": 9.34375, - "learning_rate": 6.244432626238245e-07, - "loss": 0.4898, - "num_tokens": 827097119.0, - "reward": 1.740234375, - "reward_std": 0.5760315656661987, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.8984375, - "rewards/tag_count_reward/std": 0.2291981726884842, + "grad_norm": 2.869601249694824, + "kl": 2.052734375, + "learning_rate": 6.247356878678802e-07, + "loss": 0.1089, + "num_tokens": 889677664.0, + "reward": 0.99755859375, + "reward_std": 0.2592603266239166, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.16644155979156494, "step": 1472 }, { @@ -42703,27 +42703,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 816.001953125, - "completions/mean_terminated_length": 747.41650390625, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1557.0, + "completions/max_terminated_length": 1557.0, + "completions/mean_length": 640.119140625, + "completions/mean_terminated_length": 640.119140625, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.502859093624648, - "grad_norm": 4.908143520355225, - "kl": 8.875, - "learning_rate": 6.239144886558501e-07, - "loss": 0.5392, - "num_tokens": 827588256.0, - "reward": 1.82763671875, - "reward_std": 0.5490024089813232, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.19795092940330505, + "grad_norm": 1.681730031967163, + "kl": 2.201171875, + "learning_rate": 6.242067716530666e-07, + "loss": 0.0958, + "num_tokens": 890078749.0, + "reward": 1.03076171875, + "reward_std": 0.27029573917388916, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.94482421875, + "rewards/tag_count_reward/std": 0.16180720925331116, "step": 1473 }, { @@ -42732,27 +42732,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 797.97265625, - "completions/mean_terminated_length": 755.04248046875, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 1487.0, + "completions/mean_length": 667.126953125, + "completions/mean_terminated_length": 664.4246826171875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, "epoch": 0.5032004779380388, - "grad_norm": 3.7246415615081787, - "kl": 8.203125, - "learning_rate": 6.23385609779778e-07, - "loss": 0.4694, - "num_tokens": 828080786.0, - "reward": 1.75048828125, - "reward_std": 0.5996809005737305, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.78515625, - "rewards/format_reward/std": 0.4111155867576599, - "rewards/tag_count_reward/mean": 0.89697265625, - "rewards/tag_count_reward/std": 0.22064577043056488, + "grad_norm": 6.468199253082275, + "kl": 2.486328125, + "learning_rate": 6.23677750035387e-07, + "loss": 0.1734, + "num_tokens": 890504286.0, + "reward": 1.02099609375, + "reward_std": 0.2675877809524536, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.16495952010154724, "step": 1474 }, { @@ -42761,27 +42761,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 742.259765625, - "completions/mean_terminated_length": 697.4161987304688, - "completions/min_length": 9.0, - "completions/min_terminated_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1344.0, + "completions/max_terminated_length": 1344.0, + "completions/mean_length": 583.81640625, + "completions/mean_terminated_length": 583.81640625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, "epoch": 0.5035418622514295, - "grad_norm": 1.723221778869629, - "kl": 6.6484375, - "learning_rate": 6.228566267462555e-07, - "loss": 0.4491, - "num_tokens": 828548487.0, - "reward": 1.8017578125, - "reward_std": 0.5765774250030518, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.1966511756181717, + "grad_norm": 3.8578546047210693, + "kl": 2.193359375, + "learning_rate": 6.231486237662604e-07, + "loss": 0.1237, + "num_tokens": 890890864.0, + "reward": 1.0087890625, + "reward_std": 0.20193490386009216, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9462890625, + "rewards/tag_count_reward/std": 0.1581011861562729, "step": 1475 }, { @@ -42790,27 +42790,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.06640625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1878.0, - "completions/mean_length": 853.86328125, - "completions/mean_terminated_length": 768.9246826171875, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 675.978515625, + "completions/mean_terminated_length": 673.2935180664062, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.5038832465648203, - "grad_norm": 2.8293213844299316, - "kl": 7.3125, - "learning_rate": 6.22327540306077e-07, - "loss": 0.5051, - "num_tokens": 829065681.0, - "reward": 1.732421875, - "reward_std": 0.590774416923523, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.791015625, - "rewards/format_reward/std": 0.40698084235191345, - "rewards/tag_count_reward/mean": 0.89453125, - "rewards/tag_count_reward/std": 0.22742362320423126, + "grad_norm": 6.085797309875488, + "kl": 4.3984375, + "learning_rate": 6.226193935972549e-07, + "loss": 0.2663, + "num_tokens": 891316981.0, + "reward": 0.98095703125, + "reward_std": 0.2453998625278473, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.91455078125, + "rewards/tag_count_reward/std": 0.18936549127101898, "step": 1476 }, { @@ -42819,27 +42819,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 735.369140625, - "completions/mean_terminated_length": 682.0101318359375, - "completions/min_length": 70.0, - "completions/min_terminated_length": 70.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 591.884765625, + "completions/mean_terminated_length": 589.0352172851562, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, "epoch": 0.5042246308782111, - "grad_norm": 3.193406343460083, - "kl": 5.20703125, - "learning_rate": 6.217983512101838e-07, - "loss": 0.3816, - "num_tokens": 829518894.0, - "reward": 1.84765625, - "reward_std": 0.4867114722728729, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.931640625, - "rewards/tag_count_reward/std": 0.188242569565773, + "grad_norm": 3.0584208965301514, + "kl": 3.125, + "learning_rate": 6.220900602800858e-07, + "loss": 0.1769, + "num_tokens": 891696730.0, + "reward": 1.04833984375, + "reward_std": 0.24618935585021973, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.94482421875, + "rewards/tag_count_reward/std": 0.15564262866973877, "step": 1477 }, { @@ -42848,27 +42848,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 764.109375, - "completions/mean_terminated_length": 722.6935424804688, - "completions/min_length": 32.0, - "completions/min_terminated_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1320.0, + "completions/max_terminated_length": 1320.0, + "completions/mean_length": 649.302734375, + "completions/mean_terminated_length": 649.302734375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, "epoch": 0.5045660151916019, - "grad_norm": 4.938040733337402, - "kl": 4.45703125, - "learning_rate": 6.212690602096631e-07, - "loss": 0.3255, - "num_tokens": 829986182.0, - "reward": 1.87939453125, - "reward_std": 0.48974287509918213, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.1828918159008026, + "grad_norm": 4.202577590942383, + "kl": 3.58203125, + "learning_rate": 6.215606245666152e-07, + "loss": 0.159, + "num_tokens": 892105237.0, + "reward": 0.9990234375, + "reward_std": 0.23968106508255005, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.17926456034183502, "step": 1478 }, { @@ -42877,27 +42877,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1788.0, - "completions/mean_length": 800.89453125, - "completions/mean_terminated_length": 744.9020385742188, - "completions/min_length": 90.0, - "completions/min_terminated_length": 90.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 615.537109375, + "completions/mean_terminated_length": 612.7338256835938, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.5049073995049927, - "grad_norm": 1.9067102670669556, - "kl": 6.2265625, - "learning_rate": 6.207396680557468e-07, - "loss": 0.397, - "num_tokens": 830474224.0, - "reward": 1.791015625, - "reward_std": 0.549534797668457, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.20774950087070465, + "grad_norm": 7.4503374099731445, + "kl": 3.75390625, + "learning_rate": 6.210310872088502e-07, + "loss": 0.183, + "num_tokens": 892498376.0, + "reward": 1.0009765625, + "reward_std": 0.2416778802871704, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.17168447375297546, "step": 1479 }, { @@ -42906,27 +42906,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1949.0, - "completions/mean_length": 824.9296875, - "completions/mean_terminated_length": 767.40283203125, - "completions/min_length": 71.0, - "completions/min_terminated_length": 71.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1810.0, + "completions/max_terminated_length": 1810.0, + "completions/mean_length": 617.552734375, + "completions/mean_terminated_length": 617.552734375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.5052487838183836, - "grad_norm": 1.4719449281692505, - "kl": 7.59375, - "learning_rate": 6.202101754998101e-07, - "loss": 0.4883, - "num_tokens": 830974844.0, - "reward": 1.79345703125, - "reward_std": 0.5807920694351196, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.90478515625, - "rewards/tag_count_reward/std": 0.2146575003862381, + "grad_norm": 2.9471304416656494, + "kl": 3.45703125, + "learning_rate": 6.20501448958943e-07, + "loss": 0.1724, + "num_tokens": 892892819.0, + "reward": 0.99951171875, + "reward_std": 0.25932615995407104, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.18715250492095947, "step": 1480 }, { @@ -42935,27 +42935,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 859.779296875, - "completions/mean_terminated_length": 785.8236694335938, - "completions/min_length": 70.0, - "completions/min_terminated_length": 70.0, + "completions/max_terminated_length": 1615.0, + "completions/mean_length": 662.658203125, + "completions/mean_terminated_length": 654.4931640625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, "epoch": 0.5055901681317744, - "grad_norm": 1.5499788522720337, - "kl": 9.5, - "learning_rate": 6.196805832933709e-07, - "loss": 0.6123, - "num_tokens": 831489627.0, - "reward": 1.7060546875, - "reward_std": 0.6194310784339905, - "rewards/accuracy_reward/mean": 0.02734375, - "rewards/accuracy_reward/std": 0.16324250400066376, - "rewards/format_reward/mean": 0.783203125, - "rewards/format_reward/std": 0.4124660789966583, - "rewards/tag_count_reward/mean": 0.8955078125, - "rewards/tag_count_reward/std": 0.2257176786661148, + "grad_norm": 5.1966376304626465, + "kl": 3.9375, + "learning_rate": 6.199717105691884e-07, + "loss": 0.2327, + "num_tokens": 893306676.0, + "reward": 0.9609375, + "reward_std": 0.20596902072429657, + "rewards/accuracy_reward/mean": 0.033203125, + "rewards/accuracy_reward/std": 0.17934183776378632, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.18188132345676422, "step": 1481 }, { @@ -42964,27 +42964,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1846.0, - "completions/mean_length": 763.044921875, - "completions/mean_terminated_length": 729.5691528320312, - "completions/min_length": 72.0, - "completions/min_terminated_length": 72.0, + "completions/max_terminated_length": 1225.0, + "completions/mean_length": 586.810546875, + "completions/mean_terminated_length": 583.9510498046875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, "epoch": 0.5059315524451652, - "grad_norm": 2.0553057193756104, - "kl": 5.703125, - "learning_rate": 6.191508921880886e-07, - "loss": 0.3149, - "num_tokens": 831953186.0, - "reward": 1.857421875, - "reward_std": 0.5050795674324036, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.18991030752658844, + "grad_norm": 5.172822952270508, + "kl": 3.4140625, + "learning_rate": 6.194418727920238e-07, + "loss": 0.1734, + "num_tokens": 893680003.0, + "reward": 0.97607421875, + "reward_std": 0.26199817657470703, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.91162109375, + "rewards/tag_count_reward/std": 0.19377975165843964, "step": 1482 }, { @@ -42993,27 +42993,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1725.0, - "completions/mean_length": 859.18359375, - "completions/mean_terminated_length": 798.1560668945312, - "completions/min_length": 92.0, - "completions/min_terminated_length": 92.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2047.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 662.6171875, + "completions/mean_terminated_length": 662.6171875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.5062729367585559, - "grad_norm": 1.0283100605010986, - "kl": 7.671875, - "learning_rate": 6.186211029357625e-07, - "loss": 0.4849, - "num_tokens": 832475232.0, - "reward": 1.84228515625, - "reward_std": 0.6034194231033325, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.20937539637088776, + "grad_norm": 2.583857774734497, + "kl": 2.87890625, + "learning_rate": 6.189119363800277e-07, + "loss": 0.1525, + "num_tokens": 894101407.0, + "reward": 1.06982421875, + "reward_std": 0.28229930996894836, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.18187542259693146, "step": 1483 }, { @@ -43022,27 +43022,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1976.0, - "completions/mean_length": 799.09375, - "completions/mean_terminated_length": 750.96142578125, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2037.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 614.740234375, + "completions/mean_terminated_length": 614.740234375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, "epoch": 0.5066143210719467, - "grad_norm": 2.066352605819702, - "kl": 6.2890625, - "learning_rate": 6.180912162883318e-07, - "loss": 0.4158, - "num_tokens": 832958992.0, - "reward": 1.8359375, - "reward_std": 0.45680689811706543, - "rewards/accuracy_reward/mean": 0.024193547666072845, - "rewards/accuracy_reward/std": 0.15380479395389557, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.1895880103111267, + "grad_norm": 2.175387144088745, + "kl": 2.99609375, + "learning_rate": 6.183819020859187e-07, + "loss": 0.1856, + "num_tokens": 894490778.0, + "reward": 0.98876953125, + "reward_std": 0.23335842788219452, + "rewards/accuracy_reward/mean": 0.04838709533214569, + "rewards/accuracy_reward/std": 0.21479946374893188, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.93603515625, + "rewards/tag_count_reward/std": 0.18019695580005646, "step": 1484 }, { @@ -43051,27 +43051,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 780.0859375, - "completions/mean_terminated_length": 741.8189086914062, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1648.0, + "completions/max_terminated_length": 1648.0, + "completions/mean_length": 607.95703125, + "completions/mean_terminated_length": 607.95703125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, "epoch": 0.5069557053853375, - "grad_norm": 2.2035834789276123, - "kl": 7.03125, - "learning_rate": 6.175612329978737e-07, - "loss": 0.431, - "num_tokens": 833437820.0, - "reward": 1.83984375, - "reward_std": 0.5861037373542786, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.1937359869480133, + "grad_norm": 4.318722248077393, + "kl": 1.8984375, + "learning_rate": 6.178517706625544e-07, + "loss": 0.1037, + "num_tokens": 894881476.0, + "reward": 1.0302734375, + "reward_std": 0.21671685576438904, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9423828125, + "rewards/tag_count_reward/std": 0.16057194769382477, "step": 1485 }, { @@ -43080,27 +43080,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1951.0, - "completions/mean_length": 825.13671875, - "completions/mean_terminated_length": 775.4268188476562, - "completions/min_length": 55.0, - "completions/min_terminated_length": 55.0, + "completions/max_terminated_length": 1800.0, + "completions/mean_length": 636.7578125, + "completions/mean_terminated_length": 633.99609375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, "epoch": 0.5072970896987283, - "grad_norm": 1.061333179473877, - "kl": 6.2578125, - "learning_rate": 6.170311538166026e-07, - "loss": 0.3749, - "num_tokens": 833948498.0, - "reward": 1.78759765625, - "reward_std": 0.5507444739341736, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.2099587321281433, + "grad_norm": 2.169992685317993, + "kl": 2.53125, + "learning_rate": 6.173215428629303e-07, + "loss": 0.1215, + "num_tokens": 895295704.0, + "reward": 1.0078125, + "reward_std": 0.26474007964134216, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.182897686958313, "step": 1486 }, { @@ -43109,27 +43109,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1940.0, - "completions/mean_length": 851.1640625, - "completions/mean_terminated_length": 794.8711547851562, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/max_terminated_length": 1750.0, + "completions/mean_length": 660.32421875, + "completions/mean_terminated_length": 652.1453857421875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.5076384740121191, - "grad_norm": 2.173102617263794, - "kl": 8.109375, - "learning_rate": 6.165009794968687e-07, - "loss": 0.4881, - "num_tokens": 834458966.0, - "reward": 1.7919921875, - "reward_std": 0.5524708032608032, - "rewards/accuracy_reward/mean": 0.04032257944345474, - "rewards/accuracy_reward/std": 0.19691328704357147, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.20217134058475494, + "grad_norm": 3.2754616737365723, + "kl": 2.517578125, + "learning_rate": 6.167912194401791e-07, + "loss": 0.1365, + "num_tokens": 895708462.0, + "reward": 0.9677734375, + "reward_std": 0.24602185189723969, + "rewards/accuracy_reward/mean": 0.03427419438958168, + "rewards/accuracy_reward/std": 0.18211629986763, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.18692579865455627, "step": 1487 }, { @@ -43138,27 +43138,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1973.0, - "completions/mean_length": 863.71484375, - "completions/mean_terminated_length": 802.919921875, - "completions/min_length": 153.0, - "completions/min_terminated_length": 153.0, + "completions/max_terminated_length": 1363.0, + "completions/mean_length": 650.39453125, + "completions/mean_terminated_length": 647.6594848632812, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, "epoch": 0.50797985832551, - "grad_norm": 3.464843273162842, - "kl": 9.125, - "learning_rate": 6.159707107911575e-07, - "loss": 0.5288, - "num_tokens": 834979844.0, - "reward": 1.80908203125, - "reward_std": 0.627461314201355, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.814453125, - "rewards/format_reward/std": 0.38912075757980347, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.21342535316944122, + "grad_norm": 3.8435637950897217, + "kl": 1.765625, + "learning_rate": 6.162608011475687e-07, + "loss": 0.0945, + "num_tokens": 896120120.0, + "reward": 1.0595703125, + "reward_std": 0.2676447033882141, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9501953125, + "rewards/tag_count_reward/std": 0.14824466407299042, "step": 1488 }, { @@ -43167,27 +43167,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 811.58984375, - "completions/mean_terminated_length": 753.435546875, - "completions/min_length": 236.0, - "completions/min_terminated_length": 236.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1589.0, + "completions/max_terminated_length": 1589.0, + "completions/mean_length": 584.376953125, + "completions/mean_terminated_length": 584.376953125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, "epoch": 0.5083212426389008, - "grad_norm": 2.9201884269714355, - "kl": 9.1328125, - "learning_rate": 6.154403484520887e-07, - "loss": 0.5415, - "num_tokens": 835473794.0, - "reward": 1.810546875, - "reward_std": 0.5905405282974243, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.90625, - "rewards/tag_count_reward/std": 0.21728172898292542, + "grad_norm": 2.480565309524536, + "kl": 2.470703125, + "learning_rate": 6.157302887385028e-07, + "loss": 0.1452, + "num_tokens": 896497737.0, + "reward": 1.083984375, + "reward_std": 0.24171417951583862, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.955078125, + "rewards/tag_count_reward/std": 0.143970787525177, "step": 1489 }, { @@ -43196,27 +43196,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 810.4453125, - "completions/mean_terminated_length": 762.75048828125, - "completions/min_length": 67.0, - "completions/min_terminated_length": 67.0, + "completions/max_terminated_length": 1631.0, + "completions/mean_length": 669.638671875, + "completions/mean_terminated_length": 664.2333984375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, "epoch": 0.5086626269522916, - "grad_norm": 1.2315624952316284, - "kl": 6.859375, - "learning_rate": 6.149098932324145e-07, - "loss": 0.3881, - "num_tokens": 835964902.0, - "reward": 1.8662109375, - "reward_std": 0.5907376408576965, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.19744645059108734, + "grad_norm": 2.7596516609191895, + "kl": 3.109375, + "learning_rate": 6.151996829665176e-07, + "loss": 0.1661, + "num_tokens": 896916752.0, + "reward": 1.0322265625, + "reward_std": 0.2379935383796692, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.18715058267116547, "step": 1490 }, { @@ -43225,27 +43225,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1966.0, - "completions/mean_length": 848.46484375, - "completions/mean_terminated_length": 776.4430541992188, - "completions/min_length": 66.0, - "completions/min_terminated_length": 66.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1470.0, + "completions/max_terminated_length": 1470.0, + "completions/mean_length": 611.220703125, + "completions/mean_terminated_length": 611.220703125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, "epoch": 0.5090040112656823, - "grad_norm": 1.9907715320587158, - "kl": 8.15625, - "learning_rate": 6.143793458850188e-07, - "loss": 0.5139, - "num_tokens": 836471988.0, - "reward": 1.80322265625, - "reward_std": 0.5664142370223999, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.90869140625, - "rewards/tag_count_reward/std": 0.21521764993667603, + "grad_norm": 2.4982738494873047, + "kl": 2.67578125, + "learning_rate": 6.146689845852825e-07, + "loss": 0.1347, + "num_tokens": 897302369.0, + "reward": 1.0458984375, + "reward_std": 0.26661649346351624, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.16881079971790314, "step": 1491 }, { @@ -43254,27 +43254,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 934.6953125, - "completions/mean_terminated_length": 870.2892456054688, - "completions/min_length": 102.0, - "completions/min_terminated_length": 102.0, + "completions/max_terminated_length": 1469.0, + "completions/mean_length": 695.626953125, + "completions/mean_terminated_length": 690.3235473632812, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.5093453955790731, - "grad_norm": 0.9177165031433105, - "kl": 7.0234375, - "learning_rate": 6.13848707162917e-07, - "loss": 0.4453, - "num_tokens": 837022008.0, - "reward": 1.77197265625, - "reward_std": 0.5535103678703308, - "rewards/accuracy_reward/mean": 0.058467742055654526, - "rewards/accuracy_reward/std": 0.23486268520355225, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.20741750299930573, + "grad_norm": 1.733497142791748, + "kl": 3.06640625, + "learning_rate": 6.141381943485986e-07, + "loss": 0.1658, + "num_tokens": 897729986.0, + "reward": 1.01416015625, + "reward_std": 0.24004298448562622, + "rewards/accuracy_reward/mean": 0.0786290317773819, + "rewards/accuracy_reward/std": 0.26943063735961914, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.16810958087444305, "step": 1492 }, { @@ -43283,27 +43283,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1976.0, - "completions/mean_length": 886.91796875, - "completions/mean_terminated_length": 824.8024291992188, - "completions/min_length": 84.0, - "completions/min_terminated_length": 84.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1871.0, + "completions/max_terminated_length": 1871.0, + "completions/mean_length": 648.021484375, + "completions/mean_terminated_length": 648.021484375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, "epoch": 0.5096867798924639, - "grad_norm": 1.5554848909378052, - "kl": 8.2734375, - "learning_rate": 6.133179778192533e-07, - "loss": 0.4791, - "num_tokens": 837550670.0, - "reward": 1.74462890625, - "reward_std": 0.6169675588607788, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.77734375, - "rewards/format_reward/std": 0.41643625497817993, - "rewards/tag_count_reward/mean": 0.89501953125, - "rewards/tag_count_reward/std": 0.22358404099941254, + "grad_norm": 5.1812238693237305, + "kl": 3.890625, + "learning_rate": 6.136073130103972e-07, + "loss": 0.198, + "num_tokens": 898136333.0, + "reward": 1.05029296875, + "reward_std": 0.28683164715766907, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.93310546875, + "rewards/tag_count_reward/std": 0.17287351191043854, "step": 1493 }, { @@ -43312,27 +43312,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 883.537109375, - "completions/mean_terminated_length": 826.2683715820312, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1826.0, + "completions/max_terminated_length": 1826.0, + "completions/mean_length": 673.150390625, + "completions/mean_terminated_length": 673.150390625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, "epoch": 0.5100281642058547, - "grad_norm": 1.5476243495941162, - "kl": 6.25390625, - "learning_rate": 6.127871586073012e-07, - "loss": 0.3769, - "num_tokens": 838069713.0, - "reward": 1.84033203125, - "reward_std": 0.5659228563308716, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.20606790482997894, + "grad_norm": 3.852508783340454, + "kl": 3.76953125, + "learning_rate": 6.130763413247388e-07, + "loss": 0.2057, + "num_tokens": 898547658.0, + "reward": 1.04833984375, + "reward_std": 0.2567264139652252, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.1664358228445053, "step": 1494 }, { @@ -43341,27 +43341,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 818.41015625, - "completions/mean_terminated_length": 773.6072998046875, - "completions/min_length": 41.0, - "completions/min_terminated_length": 41.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1785.0, + "completions/max_terminated_length": 1785.0, + "completions/mean_length": 598.8359375, + "completions/mean_terminated_length": 598.8359375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, "epoch": 0.5103695485192455, - "grad_norm": 3.4402146339416504, - "kl": 5.52734375, - "learning_rate": 6.122562502804614e-07, - "loss": 0.3724, - "num_tokens": 838570083.0, - "reward": 1.84912109375, - "reward_std": 0.5798235535621643, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.19901487231254578, + "grad_norm": 2.0584053993225098, + "kl": 3.38671875, + "learning_rate": 6.125452800458128e-07, + "loss": 0.1869, + "num_tokens": 898935606.0, + "reward": 1.0498046875, + "reward_std": 0.2755335867404938, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.16064335405826569, "step": 1495 }, { @@ -43370,27 +43370,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1980.0, - "completions/mean_length": 825.552734375, - "completions/mean_terminated_length": 768.05517578125, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1943.0, + "completions/max_terminated_length": 1943.0, + "completions/mean_length": 612.873046875, + "completions/mean_terminated_length": 612.873046875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.5107109328326364, - "grad_norm": 1.9902147054672241, - "kl": 5.7890625, - "learning_rate": 6.117252535922611e-07, - "loss": 0.3707, - "num_tokens": 839060926.0, - "reward": 1.7998046875, - "reward_std": 0.5655973553657532, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.8125, - "rewards/format_reward/std": 0.39069411158561707, - "rewards/tag_count_reward/mean": 0.9091796875, - "rewards/tag_count_reward/std": 0.2128543108701706, + "grad_norm": 3.267296075820923, + "kl": 4.2109375, + "learning_rate": 6.120141299279355e-07, + "loss": 0.2608, + "num_tokens": 899317557.0, + "reward": 1.05126953125, + "reward_std": 0.22006458044052124, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.94384765625, + "rewards/tag_count_reward/std": 0.1629781574010849, "step": 1496 }, { @@ -43399,27 +43399,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 831.1640625, - "completions/mean_terminated_length": 781.6991577148438, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1554.0, + "completions/max_terminated_length": 1554.0, + "completions/mean_length": 624.37890625, + "completions/mean_terminated_length": 624.37890625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, "epoch": 0.5110523171460272, - "grad_norm": 1.3155391216278076, - "kl": 6.11328125, - "learning_rate": 6.111941692963531e-07, - "loss": 0.3642, - "num_tokens": 839563186.0, - "reward": 1.84375, - "reward_std": 0.5692417621612549, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.19282633066177368, + "grad_norm": 5.56282377243042, + "kl": 3.28125, + "learning_rate": 6.114828917255493e-07, + "loss": 0.1686, + "num_tokens": 899713943.0, + "reward": 1.052734375, + "reward_std": 0.291049599647522, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.16985194385051727, "step": 1497 }, { @@ -43428,27 +43428,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 829.890625, - "completions/mean_terminated_length": 756.753662109375, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/max_terminated_length": 1951.0, + "completions/mean_length": 643.279296875, + "completions/mean_terminated_length": 637.7706298828125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, "epoch": 0.511393701459418, - "grad_norm": 0.9072387218475342, - "kl": 6.34375, - "learning_rate": 6.106629981465142e-07, - "loss": 0.3769, - "num_tokens": 840072794.0, - "reward": 1.7841796875, - "reward_std": 0.6087607145309448, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.791015625, - "rewards/format_reward/std": 0.40698084235191345, - "rewards/tag_count_reward/mean": 0.8994140625, - "rewards/tag_count_reward/std": 0.21871723234653473, + "grad_norm": 5.571586608886719, + "kl": 4.2734375, + "learning_rate": 6.109515661932221e-07, + "loss": 0.2214, + "num_tokens": 900128006.0, + "reward": 1.03955078125, + "reward_std": 0.2976807951927185, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.17600135505199432, "step": 1498 }, { @@ -43457,27 +43457,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 795.34375, - "completions/mean_terminated_length": 767.84033203125, - "completions/min_length": 70.0, - "completions/min_terminated_length": 70.0, + "completions/max_terminated_length": 1292.0, + "completions/mean_length": 587.203125, + "completions/mean_terminated_length": 584.3444213867188, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.5117350857728087, - "grad_norm": 1.411439061164856, - "kl": 6.125, - "learning_rate": 6.101317408966451e-07, - "loss": 0.3668, - "num_tokens": 840557850.0, - "reward": 1.81591796875, - "reward_std": 0.5884963274002075, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.1925777792930603, + "grad_norm": 3.2270994186401367, + "kl": 2.85546875, + "learning_rate": 6.104201540856454e-07, + "loss": 0.1776, + "num_tokens": 900506494.0, + "reward": 1.06298828125, + "reward_std": 0.24954479932785034, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.94580078125, + "rewards/tag_count_reward/std": 0.15361572802066803, "step": 1499 }, { @@ -43486,27 +43486,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 870.6015625, - "completions/mean_terminated_length": 802.487548828125, - "completions/min_length": 89.0, - "completions/min_terminated_length": 89.0, + "completions/max_terminated_length": 1443.0, + "completions/mean_length": 627.25390625, + "completions/mean_terminated_length": 621.682373046875, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, "epoch": 0.5120764700861995, - "grad_norm": 2.008626937866211, - "kl": 8.2109375, - "learning_rate": 6.096003983007679e-07, - "loss": 0.4929, - "num_tokens": 841079662.0, - "reward": 1.78076171875, - "reward_std": 0.5997934341430664, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.89013671875, - "rewards/tag_count_reward/std": 0.2288304716348648, + "grad_norm": 2.260396718978882, + "kl": 2.76953125, + "learning_rate": 6.098886561576336e-07, + "loss": 0.1301, + "num_tokens": 900903712.0, + "reward": 1.0625, + "reward_std": 0.24853083491325378, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.947265625, + "rewards/tag_count_reward/std": 0.1537284255027771, "step": 1500 }, { @@ -43515,27 +43515,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 754.94140625, - "completions/mean_terminated_length": 710.5333862304688, - "completions/min_length": 114.0, - "completions/min_terminated_length": 114.0, + "completions/max_terminated_length": 1640.0, + "completions/mean_length": 583.193359375, + "completions/mean_terminated_length": 580.3267822265625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, "epoch": 0.5124178543995903, - "grad_norm": 1.2399321794509888, - "kl": 6.19921875, - "learning_rate": 6.090689711130263e-07, - "loss": 0.3782, - "num_tokens": 841540032.0, - "reward": 1.845703125, - "reward_std": 0.5702787637710571, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.912109375, - "rewards/tag_count_reward/std": 0.20473802089691162, + "grad_norm": 2.694723606109619, + "kl": 2.7734375, + "learning_rate": 6.093570731641236e-07, + "loss": 0.1991, + "num_tokens": 901276147.0, + "reward": 1.08984375, + "reward_std": 0.26263895630836487, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.1495569944381714, "step": 1501 }, { @@ -43544,27 +43544,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1961.0, - "completions/mean_length": 814.29296875, - "completions/mean_terminated_length": 756.2658081054688, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1791.0, + "completions/max_terminated_length": 1791.0, + "completions/mean_length": 620.205078125, + "completions/mean_terminated_length": 620.205078125, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, "epoch": 0.5127592387129811, - "grad_norm": 1.3162981271743774, - "kl": 6.515625, - "learning_rate": 6.085374600876842e-07, - "loss": 0.4093, - "num_tokens": 842031254.0, - "reward": 1.78759765625, - "reward_std": 0.5792131423950195, - "rewards/accuracy_reward/mean": 0.07661290466785431, - "rewards/accuracy_reward/std": 0.2662447690963745, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.20737142860889435, + "grad_norm": 3.6175827980041504, + "kl": 2.2578125, + "learning_rate": 6.08825405860173e-07, + "loss": 0.1411, + "num_tokens": 901667996.0, + "reward": 1.04052734375, + "reward_std": 0.2540562152862549, + "rewards/accuracy_reward/mean": 0.08669354766607285, + "rewards/accuracy_reward/std": 0.281669557094574, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.94482421875, + "rewards/tag_count_reward/std": 0.1587548404932022, "step": 1502 }, { @@ -43573,27 +43573,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 862.115234375, - "completions/mean_terminated_length": 823.86083984375, - "completions/min_length": 36.0, - "completions/min_terminated_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2016.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 672.44140625, + "completions/mean_terminated_length": 672.44140625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, "epoch": 0.5131006230263719, - "grad_norm": 1.4820775985717773, - "kl": 5.3828125, - "learning_rate": 6.080058659791241e-07, - "loss": 0.335, - "num_tokens": 842546865.0, - "reward": 1.79736328125, - "reward_std": 0.5385974645614624, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.19190676510334015, + "grad_norm": 1.3615964651107788, + "kl": 2.4375, + "learning_rate": 6.082936550009584e-07, + "loss": 0.094, + "num_tokens": 902086494.0, + "reward": 1.013671875, + "reward_std": 0.21543973684310913, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.943359375, + "rewards/tag_count_reward/std": 0.16393177211284637, "step": 1503 }, { @@ -43602,27 +43602,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 823.140625, - "completions/mean_terminated_length": 781.0747680664062, - "completions/min_length": 21.0, - "completions/min_terminated_length": 21.0, + "completions/max_terminated_length": 1257.0, + "completions/mean_length": 618.232421875, + "completions/mean_terminated_length": 592.6500854492188, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, "epoch": 0.5134420073397628, - "grad_norm": 1.1283048391342163, - "kl": 6.42578125, - "learning_rate": 6.074741895418466e-07, - "loss": 0.4094, - "num_tokens": 843043833.0, - "reward": 1.78857421875, - "reward_std": 0.5742803812026978, - "rewards/accuracy_reward/mean": 0.05645161122083664, - "rewards/accuracy_reward/std": 0.23102474212646484, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.90771484375, - "rewards/tag_count_reward/std": 0.21193371713161469, + "grad_norm": 2.3619580268859863, + "kl": 3.71484375, + "learning_rate": 6.077618213417761e-07, + "loss": 0.223, + "num_tokens": 902478549.0, + "reward": 1.00244140625, + "reward_std": 0.22305136919021606, + "rewards/accuracy_reward/mean": 0.06854838877916336, + "rewards/accuracy_reward/std": 0.25293970108032227, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93603515625, + "rewards/tag_count_reward/std": 0.17677061259746552, "step": 1504 }, { @@ -43631,27 +43631,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1944.0, - "completions/mean_length": 830.80078125, - "completions/mean_terminated_length": 781.3211059570312, - "completions/min_length": 78.0, - "completions/min_terminated_length": 78.0, + "completions/max_terminated_length": 1560.0, + "completions/mean_length": 661.572265625, + "completions/mean_terminated_length": 658.8590698242188, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, "epoch": 0.5137833916531536, - "grad_norm": 2.193281650543213, - "kl": 5.4375, - "learning_rate": 6.069424315304693e-07, - "loss": 0.3357, - "num_tokens": 843549619.0, - "reward": 1.83837890625, - "reward_std": 0.5884426236152649, + "grad_norm": 3.8278415203094482, + "kl": 2.52734375, + "learning_rate": 6.072299056380392e-07, + "loss": 0.1647, + "num_tokens": 902897690.0, + "reward": 1.04296875, + "reward_std": 0.24075695872306824, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.19488608837127686, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.15868334472179413, "step": 1505 }, { @@ -43660,27 +43660,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 870.666015625, - "completions/mean_terminated_length": 797.3880004882812, - "completions/min_length": 199.0, - "completions/min_terminated_length": 199.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1631.0, + "completions/max_terminated_length": 1631.0, + "completions/mean_length": 685.716796875, + "completions/mean_terminated_length": 685.716796875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.5141247759665444, - "grad_norm": 2.0715394020080566, - "kl": 6.109375, - "learning_rate": 6.064105926997251e-07, - "loss": 0.4299, - "num_tokens": 844070584.0, - "reward": 1.818359375, - "reward_std": 0.5174299478530884, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.2096906155347824, + "grad_norm": 1.8310338258743286, + "kl": 2.056640625, + "learning_rate": 6.066979086452776e-07, + "loss": 0.0986, + "num_tokens": 903323961.0, + "reward": 1.0419921875, + "reward_std": 0.21650803089141846, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9521484375, + "rewards/tag_count_reward/std": 0.14556480944156647, "step": 1506 }, { @@ -43689,27 +43689,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1787.0, - "completions/mean_length": 823.134765625, - "completions/mean_terminated_length": 760.2567138671875, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1356.0, + "completions/max_terminated_length": 1356.0, + "completions/mean_length": 651.724609375, + "completions/mean_terminated_length": 651.724609375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.5144661602799352, - "grad_norm": 2.2035629749298096, - "kl": 6.359375, - "learning_rate": 6.058786738044626e-07, - "loss": 0.4295, - "num_tokens": 844560749.0, - "reward": 1.88525390625, - "reward_std": 0.5651792883872986, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.20031657814979553, + "grad_norm": 2.043776750564575, + "kl": 1.888671875, + "learning_rate": 6.061658311191371e-07, + "loss": 0.0671, + "num_tokens": 903726364.0, + "reward": 1.07763671875, + "reward_std": 0.2787034809589386, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.94287109375, + "rewards/tag_count_reward/std": 0.16112665832042694, "step": 1507 }, { @@ -43718,27 +43718,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1922.0, - "completions/mean_length": 833.3359375, - "completions/mean_terminated_length": 809.1394653320312, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1573.0, + "completions/max_terminated_length": 1573.0, + "completions/mean_length": 642.455078125, + "completions/mean_terminated_length": 642.455078125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.5148075445933259, - "grad_norm": 1.364289402961731, - "kl": 5.0703125, - "learning_rate": 6.053466755996427e-07, - "loss": 0.3246, - "num_tokens": 845066505.0, - "reward": 1.86083984375, - "reward_std": 0.5406478643417358, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.17912793159484863, + "grad_norm": 1.9324742555618286, + "kl": 1.84765625, + "learning_rate": 6.056336738153775e-07, + "loss": 0.0579, + "num_tokens": 904134389.0, + "reward": 1.025390625, + "reward_std": 0.1923532634973526, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.947265625, + "rewards/tag_count_reward/std": 0.1421540379524231, "step": 1508 }, { @@ -43747,27 +43747,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 813.23828125, - "completions/mean_terminated_length": 760.427734375, - "completions/min_length": 11.0, - "completions/min_terminated_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1889.0, + "completions/max_terminated_length": 1889.0, + "completions/mean_length": 629.1875, + "completions/mean_terminated_length": 629.1875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.5151489289067167, - "grad_norm": 4.429231643676758, - "kl": 9.5078125, - "learning_rate": 6.0481459884034e-07, - "loss": 0.5563, - "num_tokens": 845555907.0, - "reward": 1.74072265625, - "reward_std": 0.5743678212165833, + "grad_norm": 3.028106212615967, + "kl": 1.689453125, + "learning_rate": 6.051014374898714e-07, + "loss": 0.0637, + "num_tokens": 904529557.0, + "reward": 0.998046875, + "reward_std": 0.19787713885307312, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.814453125, - "rewards/format_reward/std": 0.38912075757980347, - "rewards/tag_count_reward/mean": 0.89697265625, - "rewards/tag_count_reward/std": 0.2282744199037552, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.13763225078582764, "step": 1509 }, { @@ -43776,27 +43776,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.072265625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1984.0, - "completions/mean_length": 877.82421875, - "completions/mean_terminated_length": 786.6736450195312, - "completions/min_length": 185.0, - "completions/min_terminated_length": 185.0, + "completions/max_terminated_length": 1866.0, + "completions/mean_length": 672.52734375, + "completions/mean_terminated_length": 667.1333618164062, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, "epoch": 0.5154903132201075, - "grad_norm": 4.857111930847168, - "kl": 11.640625, - "learning_rate": 6.042824442817399e-07, - "loss": 0.7312, - "num_tokens": 846076745.0, - "reward": 1.75830078125, - "reward_std": 0.6979130506515503, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.78515625, - "rewards/format_reward/std": 0.4111155867576599, - "rewards/tag_count_reward/mean": 0.88916015625, - "rewards/tag_count_reward/std": 0.2336527705192566, + "grad_norm": 2.9959349632263184, + "kl": 1.919921875, + "learning_rate": 6.045691228986048e-07, + "loss": 0.128, + "num_tokens": 904945283.0, + "reward": 1.05078125, + "reward_std": 0.22685974836349487, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.1495569944381714, "step": 1510 }, { @@ -43805,27 +43805,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 793.458984375, - "completions/mean_terminated_length": 745.1094970703125, - "completions/min_length": 215.0, - "completions/min_terminated_length": 215.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1363.0, + "completions/max_terminated_length": 1363.0, + "completions/mean_length": 645.470703125, + "completions/mean_terminated_length": 645.470703125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, "epoch": 0.5158316975334983, - "grad_norm": 5.492738723754883, - "kl": 10.765625, - "learning_rate": 6.037502126791386e-07, - "loss": 0.6317, - "num_tokens": 846556900.0, - "reward": 1.72802734375, - "reward_std": 0.6071256995201111, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.783203125, - "rewards/format_reward/std": 0.4124660789966583, - "rewards/tag_count_reward/mean": 0.89404296875, - "rewards/tag_count_reward/std": 0.22421565651893616, + "grad_norm": 1.8489620685577393, + "kl": 2.0703125, + "learning_rate": 6.040367307976739e-07, + "loss": 0.0943, + "num_tokens": 905349668.0, + "reward": 1.00439453125, + "reward_std": 0.21467621624469757, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.94580078125, + "rewards/tag_count_reward/std": 0.15519995987415314, "step": 1511 }, { @@ -43834,27 +43834,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 838.787109375, - "completions/mean_terminated_length": 774.0966796875, - "completions/min_length": 223.0, - "completions/min_terminated_length": 223.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1677.0, + "completions/max_terminated_length": 1677.0, + "completions/mean_length": 650.904296875, + "completions/mean_terminated_length": 650.904296875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.5161730818468891, - "grad_norm": 6.562992095947266, - "kl": 9.859375, - "learning_rate": 6.032179047879413e-07, - "loss": 0.5575, - "num_tokens": 847060663.0, - "reward": 1.7548828125, - "reward_std": 0.5794814229011536, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.796875, - "rewards/format_reward/std": 0.4027182459831238, - "rewards/tag_count_reward/mean": 0.8896484375, - "rewards/tag_count_reward/std": 0.23257318139076233, + "grad_norm": 2.1271045207977295, + "kl": 2.302734375, + "learning_rate": 6.035042619432853e-07, + "loss": 0.1046, + "num_tokens": 905757235.0, + "reward": 1.02294921875, + "reward_std": 0.2402125895023346, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.17432114481925964, "step": 1512 }, { @@ -43863,27 +43863,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 814.951171875, - "completions/mean_terminated_length": 754.3093872070312, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1643.0, + "completions/max_terminated_length": 1643.0, + "completions/mean_length": 651.849609375, + "completions/mean_terminated_length": 651.849609375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.51651446616028, - "grad_norm": 3.621229410171509, - "kl": 9.2265625, - "learning_rate": 6.026855213636619e-07, - "loss": 0.5589, - "num_tokens": 847558846.0, - "reward": 1.7900390625, - "reward_std": 0.6148122549057007, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.9033203125, - "rewards/tag_count_reward/std": 0.21824489533901215, + "grad_norm": 3.7923452854156494, + "kl": 2.0625, + "learning_rate": 6.029717170917549e-07, + "loss": 0.1166, + "num_tokens": 906171910.0, + "reward": 1.07275390625, + "reward_std": 0.21571412682533264, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.95166015625, + "rewards/tag_count_reward/std": 0.13982859253883362, "step": 1513 }, { @@ -43892,27 +43892,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 818.384765625, - "completions/mean_terminated_length": 752.6028442382812, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1750.0, + "completions/max_terminated_length": 1750.0, + "completions/mean_length": 684.29296875, + "completions/mean_terminated_length": 684.29296875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, "epoch": 0.5168558504736708, - "grad_norm": 1.4343737363815308, - "kl": 8.203125, - "learning_rate": 6.021530631619213e-07, - "loss": 0.5196, - "num_tokens": 848049523.0, - "reward": 1.75732421875, - "reward_std": 0.6363678574562073, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.89208984375, - "rewards/tag_count_reward/std": 0.23554269969463348, + "grad_norm": 2.008486270904541, + "kl": 2.017578125, + "learning_rate": 6.024390969995064e-07, + "loss": 0.0941, + "num_tokens": 906593932.0, + "reward": 1.0595703125, + "reward_std": 0.2769867479801178, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.16794821619987488, "step": 1514 }, { @@ -43921,27 +43921,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 852.2578125, - "completions/mean_terminated_length": 783.0826416015625, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/max_terminated_length": 1542.0, + "completions/mean_length": 633.9375, + "completions/mean_terminated_length": 631.1702270507812, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, "epoch": 0.5171972347870616, - "grad_norm": 1.6349339485168457, - "kl": 7.8125, - "learning_rate": 6.016205309384466e-07, - "loss": 0.5089, - "num_tokens": 848560471.0, - "reward": 1.7529296875, - "reward_std": 0.6010799407958984, - "rewards/accuracy_reward/mean": 0.0463709682226181, - "rewards/accuracy_reward/std": 0.21049949526786804, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.9013671875, - "rewards/tag_count_reward/std": 0.2179294228553772, + "grad_norm": 2.8057196140289307, + "kl": 2.41796875, + "learning_rate": 6.019064024230697e-07, + "loss": 0.1181, + "num_tokens": 906993100.0, + "reward": 1.029296875, + "reward_std": 0.2821890413761139, + "rewards/accuracy_reward/mean": 0.09879032522439957, + "rewards/accuracy_reward/std": 0.2986815273761749, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.16907380521297455, "step": 1515 }, { @@ -43950,27 +43950,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.064453125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 849.228515625, - "completions/mean_terminated_length": 766.6409301757812, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 1791.0, + "completions/mean_length": 654.7109375, + "completions/mean_terminated_length": 651.9843139648438, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, "epoch": 0.5175386191004523, - "grad_norm": 2.201406240463257, - "kl": 8.15625, - "learning_rate": 6.010879254490695e-07, - "loss": 0.5508, - "num_tokens": 849075580.0, - "reward": 1.72900390625, - "reward_std": 0.6683923006057739, - "rewards/accuracy_reward/mean": 0.07661290466785431, - "rewards/accuracy_reward/std": 0.2662447690963745, - "rewards/format_reward/mean": 0.76953125, - "rewards/format_reward/std": 0.42154473066329956, - "rewards/tag_count_reward/mean": 0.88525390625, - "rewards/tag_count_reward/std": 0.23903003334999084, + "grad_norm": 3.3453571796417236, + "kl": 2.453125, + "learning_rate": 6.013736341190814e-07, + "loss": 0.1267, + "num_tokens": 907408616.0, + "reward": 1.03076171875, + "reward_std": 0.2842212915420532, + "rewards/accuracy_reward/mean": 0.10483870655298233, + "rewards/accuracy_reward/std": 0.30665475130081177, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.92724609375, + "rewards/tag_count_reward/std": 0.16976682841777802, "step": 1516 }, { @@ -43979,27 +43979,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 786.662109375, - "completions/mean_terminated_length": 730.0305786132812, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/max_terminated_length": 1456.0, + "completions/mean_length": 628.134765625, + "completions/mean_terminated_length": 625.3561401367188, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.5178800034138431, - "grad_norm": 0.942610502243042, - "kl": 6.5546875, - "learning_rate": 6.005552474497264e-07, - "loss": 0.3864, - "num_tokens": 849555007.0, - "reward": 1.72607421875, - "reward_std": 0.5106169581413269, - "rewards/accuracy_reward/mean": 0.02734375, - "rewards/accuracy_reward/std": 0.16324250400066376, - "rewards/format_reward/mean": 0.791015625, - "rewards/format_reward/std": 0.40698084235191345, - "rewards/tag_count_reward/mean": 0.90771484375, - "rewards/tag_count_reward/std": 0.20726542174816132, + "grad_norm": 2.1664962768554688, + "kl": 2.7421875, + "learning_rate": 6.008407928442829e-07, + "loss": 0.1568, + "num_tokens": 907806877.0, + "reward": 0.955078125, + "reward_std": 0.18627232313156128, + "rewards/accuracy_reward/mean": 0.025390625, + "rewards/accuracy_reward/std": 0.15746226906776428, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.17667937278747559, "step": 1517 }, { @@ -44008,27 +44008,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 759.005859375, - "completions/mean_terminated_length": 720.1026000976562, - "completions/min_length": 52.0, - "completions/min_terminated_length": 52.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2032.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 601.3359375, + "completions/mean_terminated_length": 601.3359375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, "epoch": 0.5182213877272339, - "grad_norm": 2.7662010192871094, - "kl": 5.0625, - "learning_rate": 6.000224976964563e-07, - "loss": 0.3215, - "num_tokens": 850021538.0, - "reward": 1.810546875, - "reward_std": 0.5042428970336914, - "rewards/accuracy_reward/mean": 0.038306452333927155, - "rewards/accuracy_reward/std": 0.19212885200977325, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.19859731197357178, + "grad_norm": 2.062126636505127, + "kl": 2.275390625, + "learning_rate": 6.003078793555181e-07, + "loss": 0.1439, + "num_tokens": 908192681.0, + "reward": 0.984375, + "reward_std": 0.20142188668251038, + "rewards/accuracy_reward/mean": 0.0463709682226181, + "rewards/accuracy_reward/std": 0.21049949526786804, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.16328932344913483, "step": 1518 }, { @@ -44037,27 +44037,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 823.802734375, - "completions/mean_terminated_length": 781.7596435546875, - "completions/min_length": 63.0, - "completions/min_terminated_length": 63.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1614.0, + "completions/max_terminated_length": 1614.0, + "completions/mean_length": 622.44921875, + "completions/mean_terminated_length": 622.44921875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, "epoch": 0.5185627720406247, - "grad_norm": 2.674511671066284, - "kl": 4.62890625, - "learning_rate": 5.994896769453999e-07, - "loss": 0.3008, - "num_tokens": 850525933.0, - "reward": 1.857421875, - "reward_std": 0.513138473033905, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.1939331740140915, + "grad_norm": 5.56535005569458, + "kl": 2.501953125, + "learning_rate": 5.99774894409735e-07, + "loss": 0.1179, + "num_tokens": 908593983.0, + "reward": 1.0556640625, + "reward_std": 0.27925539016723633, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.1741819977760315, "step": 1519 }, { @@ -44066,27 +44066,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 861.314453125, - "completions/mean_terminated_length": 810.5601196289062, - "completions/min_length": 192.0, - "completions/min_terminated_length": 192.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1566.0, + "completions/max_terminated_length": 1566.0, + "completions/mean_length": 663.119140625, + "completions/mean_terminated_length": 663.119140625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, "epoch": 0.5189041563540155, - "grad_norm": 3.3661677837371826, - "kl": 4.93359375, - "learning_rate": 5.989567859527988e-07, - "loss": 0.3711, - "num_tokens": 851044750.0, - "reward": 1.86083984375, - "reward_std": 0.486860990524292, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.18643119931221008, + "grad_norm": 3.0676090717315674, + "kl": 3.271484375, + "learning_rate": 5.992418387639816e-07, + "loss": 0.1499, + "num_tokens": 909011324.0, + "reward": 1.01025390625, + "reward_std": 0.2641974687576294, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.91455078125, + "rewards/tag_count_reward/std": 0.18806926906108856, "step": 1520 }, { @@ -44095,27 +44095,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 837.64453125, - "completions/mean_terminated_length": 762.3112182617188, - "completions/min_length": 196.0, - "completions/min_terminated_length": 196.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1511.0, + "completions/max_terminated_length": 1511.0, + "completions/mean_length": 608.865234375, + "completions/mean_terminated_length": 608.865234375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.5192455406674064, - "grad_norm": 4.249598979949951, - "kl": 6.265625, - "learning_rate": 5.984238254749946e-07, - "loss": 0.4595, - "num_tokens": 851550600.0, - "reward": 1.78955078125, - "reward_std": 0.5405020117759705, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.20764772593975067, + "grad_norm": 2.0117838382720947, + "kl": 2.69140625, + "learning_rate": 5.987087131754073e-07, + "loss": 0.1487, + "num_tokens": 909400039.0, + "reward": 0.98046875, + "reward_std": 0.23321151733398438, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.17247577011585236, "step": 1521 }, { @@ -44124,27 +44124,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 814.98828125, - "completions/mean_terminated_length": 735.5218505859375, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1425.0, + "completions/max_terminated_length": 1425.0, + "completions/mean_length": 606.58203125, + "completions/mean_terminated_length": 606.58203125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, "epoch": 0.5195869249807972, - "grad_norm": 1.2901614904403687, - "kl": 8.71875, - "learning_rate": 5.978907962684267e-07, - "loss": 0.5619, - "num_tokens": 852039490.0, - "reward": 1.7841796875, - "reward_std": 0.5843145847320557, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.9052734375, - "rewards/tag_count_reward/std": 0.21910135447978973, + "grad_norm": 1.9403918981552124, + "kl": 3.015625, + "learning_rate": 5.981755184012607e-07, + "loss": 0.1952, + "num_tokens": 909782225.0, + "reward": 1.01611328125, + "reward_std": 0.24976907670497894, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.1784866452217102, "step": 1522 }, { @@ -44153,27 +44153,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 783.240234375, - "completions/mean_terminated_length": 715.5781860351562, - "completions/min_length": 195.0, - "completions/min_terminated_length": 195.0, + "completions/max_terminated_length": 1337.0, + "completions/mean_length": 568.267578125, + "completions/mean_terminated_length": 565.371826171875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.519928309294188, - "grad_norm": 3.2713301181793213, - "kl": 8.2265625, - "learning_rate": 5.973576990896331e-07, - "loss": 0.4684, - "num_tokens": 852519293.0, - "reward": 1.78466796875, - "reward_std": 0.595682680606842, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.89599609375, - "rewards/tag_count_reward/std": 0.23102888464927673, + "grad_norm": 2.2290732860565186, + "kl": 2.74609375, + "learning_rate": 5.976422551988885e-07, + "loss": 0.1422, + "num_tokens": 910151962.0, + "reward": 1.07177734375, + "reward_std": 0.29213404655456543, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.93310546875, + "rewards/tag_count_reward/std": 0.16266712546348572, "step": 1523 }, { @@ -44182,27 +44182,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1957.0, - "completions/mean_length": 833.33984375, - "completions/mean_terminated_length": 781.3890380859375, - "completions/min_length": 74.0, - "completions/min_terminated_length": 74.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1491.0, + "completions/max_terminated_length": 1491.0, + "completions/mean_length": 597.623046875, + "completions/mean_terminated_length": 597.623046875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.5202696936075787, - "grad_norm": 2.170908212661743, - "kl": 8.73828125, - "learning_rate": 5.968245346952473e-07, - "loss": 0.5018, - "num_tokens": 853015963.0, - "reward": 1.72607421875, - "reward_std": 0.6007155179977417, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.796875, - "rewards/format_reward/std": 0.4027182459831238, - "rewards/tag_count_reward/mean": 0.89794921875, - "rewards/tag_count_reward/std": 0.2308426946401596, + "grad_norm": 2.628721237182617, + "kl": 3.2578125, + "learning_rate": 5.971089243257346e-07, + "loss": 0.1891, + "num_tokens": 910527945.0, + "reward": 0.982421875, + "reward_std": 0.22860752046108246, + "rewards/accuracy_reward/mean": 0.044921875, + "rewards/accuracy_reward/std": 0.20733514428138733, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.16148874163627625, "step": 1524 }, { @@ -44211,27 +44211,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.076171875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1840.0, - "completions/mean_length": 868.455078125, - "completions/mean_terminated_length": 771.19873046875, - "completions/min_length": 54.0, - "completions/min_terminated_length": 54.0, + "completions/max_terminated_length": 1469.0, + "completions/mean_length": 605.771484375, + "completions/mean_terminated_length": 597.2711181640625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.5206110779209695, - "grad_norm": 3.0966808795928955, - "kl": 10.578125, - "learning_rate": 5.962913038419988e-07, - "loss": 0.6346, - "num_tokens": 853539348.0, - "reward": 1.7158203125, - "reward_std": 0.6263778805732727, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.794921875, - "rewards/format_reward/std": 0.4041535556316376, - "rewards/tag_count_reward/mean": 0.8818359375, - "rewards/tag_count_reward/std": 0.24867978692054749, + "grad_norm": 2.0782299041748047, + "kl": 2.404296875, + "learning_rate": 5.965755265393389e-07, + "loss": 0.1361, + "num_tokens": 910916836.0, + "reward": 0.99658203125, + "reward_std": 0.21859559416770935, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.92822265625, + "rewards/tag_count_reward/std": 0.1730337291955948, "step": 1525 }, { @@ -44240,27 +44240,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 879.61328125, - "completions/mean_terminated_length": 824.658447265625, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/max_terminated_length": 1441.0, + "completions/mean_length": 605.486328125, + "completions/mean_terminated_length": 602.6633911132812, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.5209524622343603, - "grad_norm": 1.897019624710083, - "kl": 8.2890625, - "learning_rate": 5.957580072867113e-07, - "loss": 0.4556, - "num_tokens": 854069854.0, - "reward": 1.720703125, - "reward_std": 0.6369937658309937, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.78125, - "rewards/format_reward/std": 0.41380295157432556, - "rewards/tag_count_reward/mean": 0.888671875, - "rewards/tag_count_reward/std": 0.2305201143026352, + "grad_norm": 1.7351100444793701, + "kl": 2.8125, + "learning_rate": 5.960420625973368e-07, + "loss": 0.168, + "num_tokens": 911306989.0, + "reward": 1.0244140625, + "reward_std": 0.2828201353549957, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.17582006752490997, "step": 1526 }, { @@ -44269,27 +44269,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.06640625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 876.908203125, - "completions/mean_terminated_length": 793.6087646484375, - "completions/min_length": 198.0, - "completions/min_terminated_length": 198.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1429.0, + "completions/max_terminated_length": 1429.0, + "completions/mean_length": 631.916015625, + "completions/mean_terminated_length": 631.916015625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.5212938465477511, - "grad_norm": 1.3214539289474487, - "kl": 8.0859375, - "learning_rate": 5.952246457863019e-07, - "loss": 0.4706, - "num_tokens": 854593455.0, - "reward": 1.72509765625, - "reward_std": 0.5969668626785278, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.787109375, - "rewards/format_reward/std": 0.409751296043396, - "rewards/tag_count_reward/mean": 0.88916015625, - "rewards/tag_count_reward/std": 0.23207706212997437, + "grad_norm": 3.1766157150268555, + "kl": 3.22265625, + "learning_rate": 5.955085332574572e-07, + "loss": 0.198, + "num_tokens": 911705154.0, + "reward": 1.00927734375, + "reward_std": 0.26710960268974304, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91943359375, + "rewards/tag_count_reward/std": 0.18566079437732697, "step": 1527 }, { @@ -44298,27 +44298,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.064453125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 864.830078125, - "completions/mean_terminated_length": 783.3173217773438, - "completions/min_length": 167.0, - "completions/min_terminated_length": 167.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1934.0, + "completions/max_terminated_length": 1934.0, + "completions/mean_length": 601.8046875, + "completions/mean_terminated_length": 601.8046875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.5216352308611419, - "grad_norm": 2.2895874977111816, - "kl": 8.375, - "learning_rate": 5.946912200977794e-07, - "loss": 0.4919, - "num_tokens": 855108840.0, - "reward": 1.73876953125, - "reward_std": 0.5946817398071289, - "rewards/accuracy_reward/mean": 0.06653226166963577, - "rewards/accuracy_reward/std": 0.2494617998600006, - "rewards/format_reward/mean": 0.78515625, - "rewards/format_reward/std": 0.4111155867576599, - "rewards/tag_count_reward/mean": 0.88916015625, - "rewards/tag_count_reward/std": 0.23625560104846954, + "grad_norm": 3.5937228202819824, + "kl": 2.771484375, + "learning_rate": 5.949749392775221e-07, + "loss": 0.1973, + "num_tokens": 912085870.0, + "reward": 1.04248046875, + "reward_std": 0.24215394258499146, + "rewards/accuracy_reward/mean": 0.10483870655298233, + "rewards/accuracy_reward/std": 0.30665475130081177, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.93310546875, + "rewards/tag_count_reward/std": 0.1678479015827179, "step": 1528 }, { @@ -44327,27 +44327,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.064453125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 914.890625, - "completions/mean_terminated_length": 836.8267211914062, - "completions/min_length": 2.0, - "completions/min_terminated_length": 2.0, + "completions/max_terminated_length": 1401.0, + "completions/mean_length": 634.060546875, + "completions/mean_terminated_length": 631.2935180664062, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, "epoch": 0.5219766151745328, - "grad_norm": 1.369377851486206, - "kl": 8.4140625, - "learning_rate": 5.941577309782441e-07, - "loss": 0.5102, - "num_tokens": 855659632.0, - "reward": 1.7109375, - "reward_std": 0.6631219983100891, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.775390625, - "rewards/format_reward/std": 0.41773295402526855, - "rewards/tag_count_reward/mean": 0.87890625, - "rewards/tag_count_reward/std": 0.24377650022506714, + "grad_norm": 4.153964996337891, + "kl": 3.6484375, + "learning_rate": 5.944412814154454e-07, + "loss": 0.2237, + "num_tokens": 912492877.0, + "reward": 0.99169921875, + "reward_std": 0.25971394777297974, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.18451987206935883, "step": 1529 }, { @@ -44356,27 +44356,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1881.0, - "completions/mean_length": 908.5625, - "completions/mean_terminated_length": 842.6445922851562, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/max_terminated_length": 1884.0, + "completions/mean_length": 628.384765625, + "completions/mean_terminated_length": 622.8176879882812, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, "epoch": 0.5223179994879236, - "grad_norm": 2.3144214153289795, - "kl": 7.3203125, - "learning_rate": 5.936241791848863e-07, - "loss": 0.4573, - "num_tokens": 856205616.0, - "reward": 1.7529296875, - "reward_std": 0.644518256187439, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.787109375, - "rewards/format_reward/std": 0.409751296043396, - "rewards/tag_count_reward/mean": 0.8916015625, - "rewards/tag_count_reward/std": 0.22331729531288147, + "grad_norm": 3.7868857383728027, + "kl": 4.08984375, + "learning_rate": 5.939075604292317e-07, + "loss": 0.2528, + "num_tokens": 912895410.0, + "reward": 1.02490234375, + "reward_std": 0.29657381772994995, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.91162109375, + "rewards/tag_count_reward/std": 0.19314755499362946, "step": 1530 }, { @@ -44385,27 +44385,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 858.572265625, - "completions/mean_terminated_length": 779.277099609375, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1721.0, + "completions/max_terminated_length": 1721.0, + "completions/mean_length": 596.970703125, + "completions/mean_terminated_length": 596.970703125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, "epoch": 0.5226593838013144, - "grad_norm": 1.9225261211395264, - "kl": 6.0625, - "learning_rate": 5.930905654749848e-07, - "loss": 0.4011, - "num_tokens": 856718485.0, - "reward": 1.78369140625, - "reward_std": 0.556646466255188, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.8046875, - "rewards/format_reward/std": 0.3968288004398346, - "rewards/tag_count_reward/mean": 0.90283203125, - "rewards/tag_count_reward/std": 0.21774683892726898, + "grad_norm": 3.0284104347229004, + "kl": 3.55859375, + "learning_rate": 5.933737770769746e-07, + "loss": 0.231, + "num_tokens": 913274339.0, + "reward": 1.052734375, + "reward_std": 0.2362092286348343, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.17459021508693695, "step": 1531 }, { @@ -44414,27 +44414,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 847.63671875, - "completions/mean_terminated_length": 801.375244140625, - "completions/min_length": 75.0, - "completions/min_terminated_length": 75.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1877.0, + "completions/max_terminated_length": 1877.0, + "completions/mean_length": 605.32421875, + "completions/mean_terminated_length": 605.32421875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, "epoch": 0.5230007681147051, - "grad_norm": 4.881642818450928, - "kl": 4.9609375, - "learning_rate": 5.925568906059073e-07, - "loss": 0.333, - "num_tokens": 857230027.0, - "reward": 1.8037109375, - "reward_std": 0.5452776551246643, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.1946192979812622, + "grad_norm": 3.1990652084350586, + "kl": 3.48828125, + "learning_rate": 5.928399321168575e-07, + "loss": 0.1873, + "num_tokens": 913661817.0, + "reward": 0.97705078125, + "reward_std": 0.24768967926502228, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.92041015625, + "rewards/tag_count_reward/std": 0.17594705522060394, "step": 1532 }, { @@ -44443,27 +44443,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 944.017578125, - "completions/mean_terminated_length": 872.866943359375, - "completions/min_length": 74.0, - "completions/min_terminated_length": 74.0, + "completions/max_terminated_length": 1878.0, + "completions/mean_length": 691.36328125, + "completions/mean_terminated_length": 688.7084350585938, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, "epoch": 0.5233421524280959, - "grad_norm": 2.430119752883911, - "kl": 5.578125, - "learning_rate": 5.920231553351073e-07, - "loss": 0.3671, - "num_tokens": 857787924.0, - "reward": 1.78955078125, - "reward_std": 0.6080609560012817, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.798828125, - "rewards/format_reward/std": 0.4012683033943176, - "rewards/tag_count_reward/mean": 0.89697265625, - "rewards/tag_count_reward/std": 0.22449250519275665, + "grad_norm": 6.105926513671875, + "kl": 3.7109375, + "learning_rate": 5.923060263071503e-07, + "loss": 0.1926, + "num_tokens": 914090355.0, + "reward": 1.00537109375, + "reward_std": 0.2940458655357361, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.89599609375, + "rewards/tag_count_reward/std": 0.20819459855556488, "step": 1533 }, { @@ -44472,27 +44472,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 893.072265625, - "completions/mean_terminated_length": 838.75048828125, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 690.802734375, + "completions/mean_terminated_length": 677.4181518554688, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, "epoch": 0.5236835367414867, - "grad_norm": 2.535127639770508, - "kl": 4.87109375, - "learning_rate": 5.914893604201244e-07, - "loss": 0.3157, - "num_tokens": 858329081.0, - "reward": 1.82861328125, - "reward_std": 0.5903012752532959, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.814453125, - "rewards/format_reward/std": 0.38912075757980347, - "rewards/tag_count_reward/mean": 0.90869140625, - "rewards/tag_count_reward/std": 0.2106221467256546, + "grad_norm": 2.300762176513672, + "kl": 2.90234375, + "learning_rate": 5.917720604062098e-07, + "loss": 0.2004, + "num_tokens": 914527950.0, + "reward": 1.04443359375, + "reward_std": 0.29708701372146606, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.91357421875, + "rewards/tag_count_reward/std": 0.1908532679080963, "step": 1534 }, { @@ -44501,27 +44501,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1949.0, - "completions/mean_length": 877.5078125, - "completions/mean_terminated_length": 819.9425659179688, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 1696.0, + "completions/mean_length": 658.732421875, + "completions/mean_terminated_length": 653.2843627929688, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, "epoch": 0.5240249210548775, - "grad_norm": 1.4743084907531738, - "kl": 5.9609375, - "learning_rate": 5.909555066185829e-07, - "loss": 0.3752, - "num_tokens": 858867597.0, - "reward": 1.82568359375, - "reward_std": 0.5775442719459534, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.2070624828338623, + "grad_norm": 3.5259087085723877, + "kl": 2.861328125, + "learning_rate": 5.912380351724782e-07, + "loss": 0.1767, + "num_tokens": 914954453.0, + "reward": 1.1044921875, + "reward_std": 0.2828883230686188, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.15756843984127045, "step": 1535 }, { @@ -44530,27 +44530,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 856.703125, - "completions/mean_terminated_length": 813.2955932617188, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 1611.0, + "completions/mean_length": 620.984375, + "completions/mean_terminated_length": 615.3882446289062, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, "epoch": 0.5243663053682683, - "grad_norm": 0.9794783592224121, - "kl": 6.03125, - "learning_rate": 5.904215946881907e-07, - "loss": 0.3721, - "num_tokens": 859384805.0, - "reward": 1.81201171875, - "reward_std": 0.5127238035202026, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.19733208417892456, + "grad_norm": 3.0589449405670166, + "kl": 3.87109375, + "learning_rate": 5.907039513644817e-07, + "loss": 0.236, + "num_tokens": 915350973.0, + "reward": 0.99658203125, + "reward_std": 0.2699403762817383, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.92041015625, + "rewards/tag_count_reward/std": 0.17870602011680603, "step": 1536 }, { @@ -44559,27 +44559,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 924.396484375, - "completions/mean_terminated_length": 864.2860107421875, - "completions/min_length": 39.0, - "completions/min_terminated_length": 39.0, + "completions/max_terminated_length": 1775.0, + "completions/mean_length": 686.19140625, + "completions/mean_terminated_length": 680.8510131835938, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.5247076896816592, - "grad_norm": 3.236016035079956, - "kl": 8.375, - "learning_rate": 5.898876253867379e-07, - "loss": 0.4561, - "num_tokens": 859937536.0, - "reward": 1.701171875, - "reward_std": 0.6078129410743713, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.7734375, - "rewards/format_reward/std": 0.4190165400505066, - "rewards/tag_count_reward/mean": 0.888671875, - "rewards/tag_count_reward/std": 0.23263275623321533, + "grad_norm": 3.1424612998962402, + "kl": 2.095703125, + "learning_rate": 5.901698097408299e-07, + "loss": 0.1122, + "num_tokens": 915781743.0, + "reward": 1.0087890625, + "reward_std": 0.23096433281898499, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.16588735580444336, "step": 1537 }, { @@ -44588,27 +44588,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 890.083984375, - "completions/mean_terminated_length": 828.1378173828125, - "completions/min_length": 216.0, - "completions/min_terminated_length": 216.0, + "completions/max_terminated_length": 1904.0, + "completions/mean_length": 650.544921875, + "completions/mean_terminated_length": 647.8101806640625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.52504907399505, - "grad_norm": 4.2210469245910645, - "kl": 7.7421875, - "learning_rate": 5.893535994720965e-07, - "loss": 0.4208, - "num_tokens": 860468763.0, - "reward": 1.76806640625, - "reward_std": 0.6072068214416504, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.794921875, - "rewards/format_reward/std": 0.4041535556316376, - "rewards/tag_count_reward/mean": 0.90478515625, - "rewards/tag_count_reward/std": 0.21748776733875275, + "grad_norm": 3.1720972061157227, + "kl": 3.26953125, + "learning_rate": 5.896356110602143e-07, + "loss": 0.1886, + "num_tokens": 916190326.0, + "reward": 1.07958984375, + "reward_std": 0.3388897776603699, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.92529296875, + "rewards/tag_count_reward/std": 0.1808057427406311, "step": 1538 }, { @@ -44617,27 +44617,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1937.0, - "completions/mean_length": 863.265625, - "completions/mean_terminated_length": 810.0734252929688, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1995.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 660.455078125, + "completions/mean_terminated_length": 660.455078125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, "epoch": 0.5253904583084408, - "grad_norm": 2.282278060913086, - "kl": 6.46875, - "learning_rate": 5.888195177022185e-07, - "loss": 0.356, - "num_tokens": 860986867.0, - "reward": 1.8134765625, - "reward_std": 0.5585914850234985, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.19707830250263214, + "grad_norm": 2.8052926063537598, + "kl": 2.939453125, + "learning_rate": 5.891013560814078e-07, + "loss": 0.1494, + "num_tokens": 916604591.0, + "reward": 1.03662109375, + "reward_std": 0.2868598699569702, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.92724609375, + "rewards/tag_count_reward/std": 0.18092724680900574, "step": 1539 }, { @@ -44646,27 +44646,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 837.60546875, - "completions/mean_terminated_length": 767.5826416015625, - "completions/min_length": 167.0, - "completions/min_terminated_length": 167.0, - "epoch": 0.5257318426218315, - "grad_norm": 2.910249710083008, - "kl": 8.0078125, - "learning_rate": 5.882853808351354e-07, - "loss": 0.4439, - "num_tokens": 861493369.0, - "reward": 1.80908203125, - "reward_std": 0.6293829679489136, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.21437686681747437, + "completions/max_terminated_length": 1921.0, + "completions/mean_length": 632.130859375, + "completions/mean_terminated_length": 629.3600463867188, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.5257318426218315, + "grad_norm": 2.220705270767212, + "kl": 3.30859375, + "learning_rate": 5.885670455632628e-07, + "loss": 0.1765, + "num_tokens": 917005890.0, + "reward": 1.05859375, + "reward_std": 0.28806042671203613, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.17398715019226074, "step": 1540 }, { @@ -44675,27 +44675,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 952.8359375, - "completions/mean_terminated_length": 901.3251342773438, - "completions/min_length": 240.0, - "completions/min_terminated_length": 240.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 699.5, + "completions/mean_terminated_length": 696.8610229492188, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, "epoch": 0.5260732269352223, - "grad_norm": 1.2507978677749634, - "kl": 7.078125, - "learning_rate": 5.877511896289566e-07, - "loss": 0.4122, - "num_tokens": 862054181.0, - "reward": 1.79638671875, - "reward_std": 0.6029015779495239, - "rewards/accuracy_reward/mean": 0.08467742055654526, - "rewards/accuracy_reward/std": 0.278682142496109, - "rewards/format_reward/mean": 0.8046875, - "rewards/format_reward/std": 0.3968288004398346, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.21104364097118378, + "grad_norm": 2.4582996368408203, + "kl": 2.91015625, + "learning_rate": 5.88032680264711e-07, + "loss": 0.1533, + "num_tokens": 917436994.0, + "reward": 1.0478515625, + "reward_std": 0.29794150590896606, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310528099536896, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.1879250556230545, "step": 1541 }, { @@ -44704,27 +44704,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 869.470703125, - "completions/mean_terminated_length": 824.0507202148438, - "completions/min_length": 46.0, - "completions/min_terminated_length": 46.0, + "completions/max_terminated_length": 1700.0, + "completions/mean_length": 623.375, + "completions/mean_terminated_length": 620.5870971679688, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, "epoch": 0.5264146112486131, - "grad_norm": 2.267066717147827, - "kl": 6.5625, - "learning_rate": 5.872169448418688e-07, - "loss": 0.4339, - "num_tokens": 862573462.0, - "reward": 1.7919921875, - "reward_std": 0.5768818855285645, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.20379072427749634, + "grad_norm": 2.449199676513672, + "kl": 3.17578125, + "learning_rate": 5.874982609447618e-07, + "loss": 0.1839, + "num_tokens": 917830274.0, + "reward": 0.99072265625, + "reward_std": 0.2553071677684784, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.1798519492149353, "step": 1542 }, { @@ -44733,27 +44733,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1937.0, - "completions/mean_length": 877.001953125, - "completions/mean_terminated_length": 816.88916015625, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 1668.0, + "completions/mean_length": 687.986328125, + "completions/mean_terminated_length": 663.6520385742188, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, "epoch": 0.5267559955620039, - "grad_norm": 1.2301084995269775, - "kl": 6.296875, - "learning_rate": 5.866826472321351e-07, - "loss": 0.414, - "num_tokens": 863109975.0, - "reward": 1.78076171875, - "reward_std": 0.5611857175827026, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.8125, - "rewards/format_reward/std": 0.39069411158561707, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.20266404747962952, + "grad_norm": 2.059253215789795, + "kl": 4.11328125, + "learning_rate": 5.869637883625013e-07, + "loss": 0.3064, + "num_tokens": 918270011.0, + "reward": 0.97412109375, + "reward_std": 0.2899189591407776, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.89599609375, + "rewards/tag_count_reward/std": 0.21907246112823486, "step": 1543 }, { @@ -44762,27 +44762,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 915.37890625, - "completions/mean_terminated_length": 871.7281494140625, - "completions/min_length": 67.0, - "completions/min_terminated_length": 67.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 671.509765625, + "completions/mean_terminated_length": 666.11181640625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, "epoch": 0.5270973798753947, - "grad_norm": 1.3303993940353394, - "kl": 5.84375, - "learning_rate": 5.861482975580928e-07, - "loss": 0.3635, - "num_tokens": 863657049.0, - "reward": 1.724609375, - "reward_std": 0.5845118165016174, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.78515625, - "rewards/format_reward/std": 0.4111155867576599, - "rewards/tag_count_reward/mean": 0.90234375, - "rewards/tag_count_reward/std": 0.21269488334655762, + "grad_norm": 4.8275465965271, + "kl": 3.26171875, + "learning_rate": 5.864292632770911e-07, + "loss": 0.1745, + "num_tokens": 918692224.0, + "reward": 0.9765625, + "reward_std": 0.23069053888320923, + "rewards/accuracy_reward/mean": 0.044921875, + "rewards/accuracy_reward/std": 0.20733514428138733, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.17999069392681122, "step": 1544 }, { @@ -44791,27 +44791,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 912.099609375, - "completions/mean_terminated_length": 856.235595703125, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 647.107421875, + "completions/mean_terminated_length": 641.61376953125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.5274387641887855, - "grad_norm": 2.149855375289917, - "kl": 6.6015625, - "learning_rate": 5.856138965781538e-07, - "loss": 0.373, - "num_tokens": 864208796.0, - "reward": 1.7548828125, - "reward_std": 0.5656696557998657, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.9111328125, - "rewards/tag_count_reward/std": 0.20129980146884918, + "grad_norm": 7.4420037269592285, + "kl": 4.16015625, + "learning_rate": 5.858946864477675e-07, + "loss": 0.2589, + "num_tokens": 919108295.0, + "reward": 0.974609375, + "reward_std": 0.25603586435317993, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.19091396033763885, "step": 1545 }, { @@ -44820,27 +44820,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 908.794921875, - "completions/mean_terminated_length": 845.3753051757812, - "completions/min_length": 35.0, - "completions/min_terminated_length": 35.0, + "completions/max_terminated_length": 1836.0, + "completions/mean_length": 691.767578125, + "completions/mean_terminated_length": 683.7741088867188, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, "epoch": 0.5277801485021764, - "grad_norm": 1.1292212009429932, - "kl": 7.1015625, - "learning_rate": 5.850794450508026e-07, - "loss": 0.4205, - "num_tokens": 864750387.0, - "reward": 1.69970703125, - "reward_std": 0.5838012099266052, - "rewards/accuracy_reward/mean": 0.03333333507180214, - "rewards/accuracy_reward/std": 0.17969276010990143, - "rewards/format_reward/mean": 0.7734375, - "rewards/format_reward/std": 0.4190165400505066, - "rewards/tag_count_reward/mean": 0.89501953125, - "rewards/tag_count_reward/std": 0.22138507664203644, + "grad_norm": 4.548550605773926, + "kl": 3.6484375, + "learning_rate": 5.853600586338406e-07, + "loss": 0.1979, + "num_tokens": 919538768.0, + "reward": 1.0107421875, + "reward_std": 0.28472375869750977, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24231401085853577, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.18428993225097656, "step": 1546 }, { @@ -44849,27 +44849,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 991.205078125, - "completions/mean_terminated_length": 946.0061645507812, - "completions/min_length": 153.0, - "completions/min_terminated_length": 153.0, + "completions/max_terminated_length": 1721.0, + "completions/mean_length": 731.806640625, + "completions/mean_terminated_length": 729.2308959960938, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.5281215328155672, - "grad_norm": 0.747988224029541, - "kl": 6.203125, - "learning_rate": 5.845449437345952e-07, - "loss": 0.3744, - "num_tokens": 865338316.0, - "reward": 1.7763671875, - "reward_std": 0.546890139579773, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.20055793225765228, + "grad_norm": 17.49198341369629, + "kl": 3.734375, + "learning_rate": 5.848253805946924e-07, + "loss": 0.1958, + "num_tokens": 919993885.0, + "reward": 0.98291015625, + "reward_std": 0.2726157009601593, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.90869140625, + "rewards/tag_count_reward/std": 0.2059241086244583, "step": 1547 }, { @@ -44878,27 +44878,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 862.59765625, - "completions/mean_terminated_length": 821.8869018554688, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 656.03515625, + "completions/mean_terminated_length": 653.3111572265625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, "epoch": 0.5284629171289579, - "grad_norm": 1.1407604217529297, - "kl": 5.671875, - "learning_rate": 5.840103933881584e-07, - "loss": 0.3333, - "num_tokens": 865862510.0, - "reward": 1.79541015625, - "reward_std": 0.5912949442863464, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.8046875, - "rewards/format_reward/std": 0.3968288004398346, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.18998514115810394, + "grad_norm": 4.080907821655273, + "kl": 2.689453125, + "learning_rate": 5.842906530897763e-07, + "loss": 0.1151, + "num_tokens": 920412319.0, + "reward": 1.02197265625, + "reward_std": 0.26531165838241577, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.93408203125, + "rewards/tag_count_reward/std": 0.17812223732471466, "step": 1548 }, { @@ -44907,27 +44907,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 869.046875, - "completions/mean_terminated_length": 828.5576171875, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/max_terminated_length": 1916.0, + "completions/mean_length": 659.072265625, + "completions/mean_terminated_length": 650.8860473632812, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.5288043014423487, - "grad_norm": 1.438033103942871, - "kl": 5.171875, - "learning_rate": 5.834757947701889e-07, - "loss": 0.298, - "num_tokens": 866384982.0, - "reward": 1.78955078125, - "reward_std": 0.5551824569702148, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.8125, - "rewards/format_reward/std": 0.39069411158561707, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.18427114188671112, + "grad_norm": 2.370020866394043, + "kl": 3.7890625, + "learning_rate": 5.837558768786166e-07, + "loss": 0.2119, + "num_tokens": 920827284.0, + "reward": 1.0224609375, + "reward_std": 0.3238842189311981, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.9169921875, + "rewards/tag_count_reward/std": 0.20076745748519897, "step": 1549 }, { @@ -44936,27 +44936,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1937.0, - "completions/mean_length": 913.712890625, - "completions/mean_terminated_length": 848.0929565429688, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 674.91796875, + "completions/mean_terminated_length": 672.2308959960938, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.5291456857557395, - "grad_norm": 1.2410811185836792, - "kl": 7.703125, - "learning_rate": 5.829411486394516e-07, - "loss": 0.4634, - "num_tokens": 866937091.0, - "reward": 1.69873046875, - "reward_std": 0.6290674805641174, - "rewards/accuracy_reward/mean": 0.05645161122083664, - "rewards/accuracy_reward/std": 0.23102474212646484, - "rewards/format_reward/mean": 0.755859375, - "rewards/format_reward/std": 0.42999663949012756, - "rewards/tag_count_reward/mean": 0.88818359375, - "rewards/tag_count_reward/std": 0.2257234901189804, + "grad_norm": 6.898661136627197, + "kl": 2.802734375, + "learning_rate": 5.832210527208059e-07, + "loss": 0.1634, + "num_tokens": 921257130.0, + "reward": 0.99169921875, + "reward_std": 0.2672974467277527, + "rewards/accuracy_reward/mean": 0.07056451588869095, + "rewards/accuracy_reward/std": 0.25635457038879395, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.91552734375, + "rewards/tag_count_reward/std": 0.20408765971660614, "step": 1550 }, { @@ -44965,27 +44965,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1924.0, - "completions/mean_length": 904.966796875, - "completions/mean_terminated_length": 856.0794677734375, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1890.0, + "completions/max_terminated_length": 1890.0, + "completions/mean_length": 682.130859375, + "completions/mean_terminated_length": 682.130859375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.5294870700691303, - "grad_norm": 2.584840774536133, - "kl": 5.7578125, - "learning_rate": 5.824064557547785e-07, - "loss": 0.3879, - "num_tokens": 867483090.0, - "reward": 1.78076171875, - "reward_std": 0.5492658615112305, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.20694707334041595, + "grad_norm": 3.389392375946045, + "kl": 2.28125, + "learning_rate": 5.826861813760056e-07, + "loss": 0.1314, + "num_tokens": 921689037.0, + "reward": 1.00732421875, + "reward_std": 0.250784307718277, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.19182707369327545, "step": 1551 }, { @@ -44994,27 +44994,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 816.796875, - "completions/mean_terminated_length": 782.1846923828125, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1705.0, + "completions/max_terminated_length": 1705.0, + "completions/mean_length": 664.36328125, + "completions/mean_terminated_length": 664.36328125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.5298284543825211, - "grad_norm": 1.3334747552871704, - "kl": 6.078125, - "learning_rate": 5.81871716875069e-07, - "loss": 0.3443, - "num_tokens": 867971066.0, - "reward": 1.8603515625, - "reward_std": 0.6013159155845642, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.19242700934410095, + "grad_norm": 2.228701591491699, + "kl": 1.732421875, + "learning_rate": 5.821512636039437e-07, + "loss": 0.057, + "num_tokens": 922098967.0, + "reward": 1.06494140625, + "reward_std": 0.264367938041687, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.94970703125, + "rewards/tag_count_reward/std": 0.15013012290000916, "step": 1552 }, { @@ -45023,27 +45023,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 867.712890625, - "completions/mean_terminated_length": 819.7337036132812, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 1531.0, + "completions/mean_length": 663.96484375, + "completions/mean_terminated_length": 658.5372924804688, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.5301698386959119, - "grad_norm": 1.3950985670089722, - "kl": 7.46875, - "learning_rate": 5.813369327592867e-07, - "loss": 0.4191, - "num_tokens": 868499111.0, - "reward": 1.67822265625, - "reward_std": 0.6357830762863159, - "rewards/accuracy_reward/mean": 0.025390625, - "rewards/accuracy_reward/std": 0.15746226906776428, - "rewards/format_reward/mean": 0.765625, - "rewards/format_reward/std": 0.42402184009552, - "rewards/tag_count_reward/mean": 0.88720703125, - "rewards/tag_count_reward/std": 0.2257785201072693, + "grad_norm": 2.460380792617798, + "kl": 2.2578125, + "learning_rate": 5.816163001644143e-07, + "loss": 0.1275, + "num_tokens": 922522693.0, + "reward": 1.0, + "reward_std": 0.2462298572063446, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.939453125, + "rewards/tag_count_reward/std": 0.16101467609405518, "step": 1553 }, { @@ -45052,27 +45052,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 847.892578125, - "completions/mean_terminated_length": 799.107666015625, - "completions/min_length": 134.0, - "completions/min_terminated_length": 134.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1961.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 712.27734375, + "completions/mean_terminated_length": 712.27734375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.5305112230093028, - "grad_norm": 3.1379051208496094, - "kl": 6.96875, - "learning_rate": 5.808021041664599e-07, - "loss": 0.3617, - "num_tokens": 869014336.0, - "reward": 1.73681640625, - "reward_std": 0.5985440611839294, - "rewards/accuracy_reward/mean": 0.05443548411130905, - "rewards/accuracy_reward/std": 0.227104052901268, - "rewards/format_reward/mean": 0.77734375, - "rewards/format_reward/std": 0.41643625497817993, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.20623478293418884, + "grad_norm": 4.10263204574585, + "kl": 2.498046875, + "learning_rate": 5.810812918172764e-07, + "loss": 0.1645, + "num_tokens": 922968483.0, + "reward": 1.0078125, + "reward_std": 0.25087910890579224, + "rewards/accuracy_reward/mean": 0.07661290466785431, + "rewards/accuracy_reward/std": 0.2662447690963745, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.180202916264534, "step": 1554 }, { @@ -45081,27 +45081,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 850.486328125, - "completions/mean_terminated_length": 791.5921630859375, - "completions/min_length": 234.0, - "completions/min_terminated_length": 234.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1714.0, + "completions/max_terminated_length": 1714.0, + "completions/mean_length": 665.583984375, + "completions/mean_terminated_length": 665.583984375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.5308526073226936, - "grad_norm": 1.0754282474517822, - "kl": 7.5703125, - "learning_rate": 5.802672318556802e-07, - "loss": 0.451, - "num_tokens": 869526233.0, - "reward": 1.76708984375, - "reward_std": 0.589768648147583, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.90380859375, - "rewards/tag_count_reward/std": 0.21874071657657623, + "grad_norm": 1.5180892944335938, + "kl": 2.013671875, + "learning_rate": 5.805462393224526e-07, + "loss": 0.1041, + "num_tokens": 923385710.0, + "reward": 1.033203125, + "reward_std": 0.24109819531440735, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.17345911264419556, "step": 1555 }, { @@ -45110,27 +45110,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 919.279296875, - "completions/mean_terminated_length": 849.0270385742188, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 1777.0, + "completions/mean_length": 725.052734375, + "completions/mean_terminated_length": 714.6358032226562, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, "epoch": 0.5311939916360843, - "grad_norm": 3.4173245429992676, - "kl": 7.9765625, - "learning_rate": 5.797323165861007e-07, - "loss": 0.4163, - "num_tokens": 870070072.0, - "reward": 1.7373046875, - "reward_std": 0.6712255477905273, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.759765625, - "rewards/format_reward/std": 0.4276435375213623, - "rewards/tag_count_reward/mean": 0.8798828125, - "rewards/tag_count_reward/std": 0.23611247539520264, + "grad_norm": 1.8202368021011353, + "kl": 2.1796875, + "learning_rate": 5.800111434399285e-07, + "loss": 0.0826, + "num_tokens": 923830105.0, + "reward": 1.0849609375, + "reward_std": 0.2806433439254761, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.1598801612854004, "step": 1556 }, { @@ -45139,27 +45139,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 854.3359375, - "completions/mean_terminated_length": 803.2831420898438, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1812.0, + "completions/max_terminated_length": 1812.0, + "completions/mean_length": 707.529296875, + "completions/mean_terminated_length": 707.529296875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.5315353759494751, - "grad_norm": 1.6247764825820923, - "kl": 7.0859375, - "learning_rate": 5.791973591169359e-07, - "loss": 0.4521, - "num_tokens": 870587988.0, - "reward": 1.76611328125, - "reward_std": 0.6064735651016235, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.80078125, - "rewards/format_reward/std": 0.39980348944664, - "rewards/tag_count_reward/mean": 0.90283203125, - "rewards/tag_count_reward/std": 0.21774683892726898, + "grad_norm": 4.547028064727783, + "kl": 2.20703125, + "learning_rate": 5.794760049297511e-07, + "loss": 0.1632, + "num_tokens": 924272856.0, + "reward": 1.02783203125, + "reward_std": 0.25383156538009644, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.94970703125, + "rewards/tag_count_reward/std": 0.1549411565065384, "step": 1557 }, { @@ -45168,27 +45168,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1935.0, - "completions/mean_length": 899.740234375, - "completions/mean_terminated_length": 848.1856689453125, - "completions/min_length": 104.0, - "completions/min_terminated_length": 104.0, + "completions/max_terminated_length": 1894.0, + "completions/mean_length": 744.09765625, + "completions/mean_terminated_length": 728.6364135742188, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, "epoch": 0.5318767602628659, - "grad_norm": 1.4438574314117432, - "kl": 6.03125, - "learning_rate": 5.786623602074602e-07, - "loss": 0.3463, - "num_tokens": 871129135.0, - "reward": 1.767578125, - "reward_std": 0.6126869916915894, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.787109375, - "rewards/format_reward/std": 0.409751296043396, - "rewards/tag_count_reward/mean": 0.90234375, - "rewards/tag_count_reward/std": 0.21724654734134674, + "grad_norm": 3.3136181831359863, + "kl": 2.71484375, + "learning_rate": 5.78940824552028e-07, + "loss": 0.1346, + "num_tokens": 924734314.0, + "reward": 1.06298828125, + "reward_std": 0.3048211336135864, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.93603515625, + "rewards/tag_count_reward/std": 0.17677061259746552, "step": 1558 }, { @@ -45197,27 +45197,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 903.208984375, - "completions/mean_terminated_length": 856.6727294921875, - "completions/min_length": 180.0, - "completions/min_terminated_length": 180.0, + "completions/max_terminated_length": 1924.0, + "completions/mean_length": 725.783203125, + "completions/mean_terminated_length": 723.1956787109375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.5322181445762567, - "grad_norm": 2.136967658996582, - "kl": 5.21484375, - "learning_rate": 5.781273206170065e-07, - "loss": 0.311, - "num_tokens": 871661450.0, - "reward": 1.76416015625, - "reward_std": 0.589134693145752, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.79296875, - "rewards/format_reward/std": 0.40557438135147095, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.19531220197677612, + "grad_norm": 2.5662450790405273, + "kl": 2.802734375, + "learning_rate": 5.784056030669264e-07, + "loss": 0.127, + "num_tokens": 925175787.0, + "reward": 1.02587890625, + "reward_std": 0.2736976146697998, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.17731572687625885, "step": 1559 }, { @@ -45226,27 +45226,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1958.0, - "completions/mean_length": 860.767578125, - "completions/mean_terminated_length": 841.9226684570312, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1735.0, + "completions/max_terminated_length": 1735.0, + "completions/mean_length": 679.6328125, + "completions/mean_terminated_length": 679.6328125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, "epoch": 0.5325595288896475, - "grad_norm": 2.0706610679626465, - "kl": 4.4453125, - "learning_rate": 5.775922411049657e-07, - "loss": 0.2632, - "num_tokens": 872184963.0, - "reward": 1.82861328125, - "reward_std": 0.5420112013816833, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.18174928426742554, + "grad_norm": 2.3434205055236816, + "kl": 3.35546875, + "learning_rate": 5.778703412346717e-07, + "loss": 0.1737, + "num_tokens": 925606559.0, + "reward": 1.04052734375, + "reward_std": 0.28256556391716003, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.18665657937526703, "step": 1560 }, { @@ -45255,27 +45255,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 942.40234375, - "completions/mean_terminated_length": 892.7632446289062, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1918.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 745.27734375, + "completions/mean_terminated_length": 745.27734375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.5329009132030383, - "grad_norm": 2.4473965167999268, - "kl": 6.07421875, - "learning_rate": 5.770571224307855e-07, - "loss": 0.3488, - "num_tokens": 872756385.0, - "reward": 1.701171875, - "reward_std": 0.6234464049339294, + "grad_norm": 3.3408772945404053, + "kl": 2.96875, + "learning_rate": 5.773350398155467e-07, + "loss": 0.1257, + "num_tokens": 926077053.0, + "reward": 0.97705078125, + "reward_std": 0.23241698741912842, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.763671875, - "rewards/format_reward/std": 0.42524150013923645, - "rewards/tag_count_reward/mean": 0.890625, - "rewards/tag_count_reward/std": 0.21215508878231049, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.92041015625, + "rewards/tag_count_reward/std": 0.1820959448814392, "step": 1561 }, { @@ -45284,27 +45284,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1909.0, - "completions/mean_length": 802.51171875, - "completions/mean_terminated_length": 775.1656494140625, - "completions/min_length": 18.0, - "completions/min_terminated_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1672.0, + "completions/max_terminated_length": 1672.0, + "completions/mean_length": 633.19140625, + "completions/mean_terminated_length": 633.19140625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, "epoch": 0.5332422975164292, - "grad_norm": 2.3419058322906494, - "kl": 5.203125, - "learning_rate": 5.765219653539687e-07, - "loss": 0.3138, - "num_tokens": 873240055.0, - "reward": 1.82763671875, - "reward_std": 0.6118506193161011, - "rewards/accuracy_reward/mean": 0.09879032522439957, - "rewards/accuracy_reward/std": 0.2986815273761749, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.91162109375, - "rewards/tag_count_reward/std": 0.20060259103775024, + "grad_norm": 4.252152442932129, + "kl": 2.640625, + "learning_rate": 5.767996995698904e-07, + "loss": 0.1328, + "num_tokens": 926474031.0, + "reward": 1.11767578125, + "reward_std": 0.29964667558670044, + "rewards/accuracy_reward/mean": 0.1552419364452362, + "rewards/accuracy_reward/std": 0.36250078678131104, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.95947265625, + "rewards/tag_count_reward/std": 0.13064312934875488, "step": 1562 }, { @@ -45313,27 +45313,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 859.98828125, - "completions/mean_terminated_length": 806.64892578125, - "completions/min_length": 19.0, - "completions/min_terminated_length": 19.0, + "completions/max_terminated_length": 1756.0, + "completions/mean_length": 676.919921875, + "completions/mean_terminated_length": 674.2367553710938, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, "epoch": 0.53358368182982, - "grad_norm": 2.196070909500122, - "kl": 5.71875, - "learning_rate": 5.759867706340731e-07, - "loss": 0.3871, - "num_tokens": 873755809.0, - "reward": 1.79052734375, - "reward_std": 0.5443712472915649, - "rewards/accuracy_reward/mean": 0.04233871027827263, - "rewards/accuracy_reward/std": 0.2015640139579773, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.19778190553188324, + "grad_norm": 3.494696617126465, + "kl": 3.48828125, + "learning_rate": 5.762643212580971e-07, + "loss": 0.2076, + "num_tokens": 926896054.0, + "reward": 1.0078125, + "reward_std": 0.23257681727409363, + "rewards/accuracy_reward/mean": 0.058467742055654526, + "rewards/accuracy_reward/std": 0.23486268520355225, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.943359375, + "rewards/tag_count_reward/std": 0.1609196960926056, "step": 1563 }, { @@ -45342,27 +45342,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 894.728515625, - "completions/mean_terminated_length": 828.0103149414062, - "completions/min_length": 114.0, - "completions/min_terminated_length": 114.0, + "completions/max_terminated_length": 1802.0, + "completions/mean_length": 696.3125, + "completions/mean_terminated_length": 691.0117797851562, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, "epoch": 0.5339250661432107, - "grad_norm": 3.4765467643737793, - "kl": 6.921875, - "learning_rate": 5.754515390307095e-07, - "loss": 0.357, - "num_tokens": 874297702.0, - "reward": 1.71923828125, - "reward_std": 0.648097038269043, - "rewards/accuracy_reward/mean": 0.09677419066429138, - "rewards/accuracy_reward/std": 0.2959485352039337, - "rewards/format_reward/mean": 0.74609375, - "rewards/format_reward/std": 0.43567025661468506, - "rewards/tag_count_reward/mean": 0.87939453125, - "rewards/tag_count_reward/std": 0.22224663197994232, + "grad_norm": 2.29425048828125, + "kl": 3.615234375, + "learning_rate": 5.757289056406148e-07, + "loss": 0.197, + "num_tokens": 927336358.0, + "reward": 1.0400390625, + "reward_std": 0.22682341933250427, + "rewards/accuracy_reward/mean": 0.09879032522439957, + "rewards/accuracy_reward/std": 0.2986815273761749, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.17166221141815186, "step": 1564 }, { @@ -45371,27 +45371,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1970.0, - "completions/mean_length": 854.525390625, - "completions/mean_terminated_length": 813.5374145507812, - "completions/min_length": 172.0, - "completions/min_terminated_length": 172.0, + "completions/max_terminated_length": 1920.0, + "completions/mean_length": 676.64453125, + "completions/mean_terminated_length": 673.9608764648438, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.5342664504566015, - "grad_norm": 4.09050989151001, - "kl": 6.34375, - "learning_rate": 5.749162713035415e-07, - "loss": 0.3373, - "num_tokens": 874815875.0, - "reward": 1.7197265625, - "reward_std": 0.5855885744094849, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.78125, - "rewards/format_reward/std": 0.41380295157432556, - "rewards/tag_count_reward/mean": 0.9013671875, - "rewards/tag_count_reward/std": 0.19792981445789337, + "grad_norm": 1.938565969467163, + "kl": 2.9765625, + "learning_rate": 5.751934534779448e-07, + "loss": 0.1761, + "num_tokens": 927763456.0, + "reward": 1.02197265625, + "reward_std": 0.21964076161384583, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.94580078125, + "rewards/tag_count_reward/std": 0.15120825171470642, "step": 1565 }, { @@ -45400,27 +45400,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1964.0, - "completions/mean_length": 879.75, - "completions/mean_terminated_length": 824.8016357421875, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 1565.0, + "completions/mean_length": 668.9921875, + "completions/mean_terminated_length": 666.2935180664062, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, "epoch": 0.5346078347699923, - "grad_norm": 3.7021701335906982, - "kl": 6.5078125, - "learning_rate": 5.743809682122836e-07, - "loss": 0.3449, - "num_tokens": 875343107.0, - "reward": 1.7158203125, - "reward_std": 0.6114732027053833, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.7734375, - "rewards/format_reward/std": 0.4190165400505066, - "rewards/tag_count_reward/mean": 0.8994140625, - "rewards/tag_count_reward/std": 0.19632047414779663, + "grad_norm": 2.2944107055664062, + "kl": 3.90625, + "learning_rate": 5.746579655306403e-07, + "loss": 0.2228, + "num_tokens": 928182780.0, + "reward": 0.9931640625, + "reward_std": 0.230872243642807, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.1809411197900772, "step": 1566 }, { @@ -45429,27 +45429,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1967.0, - "completions/mean_length": 848.44140625, - "completions/mean_terminated_length": 812.2373657226562, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/max_terminated_length": 1737.0, + "completions/mean_length": 694.568359375, + "completions/mean_terminated_length": 691.9197387695312, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, "epoch": 0.5349492190833831, - "grad_norm": 4.1037702560424805, - "kl": 5.703125, - "learning_rate": 5.738456305167007e-07, - "loss": 0.3173, - "num_tokens": 875859541.0, - "reward": 1.763671875, - "reward_std": 0.6081827878952026, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.794921875, - "rewards/format_reward/std": 0.4041535556316376, - "rewards/tag_count_reward/mean": 0.892578125, - "rewards/tag_count_reward/std": 0.2210400402545929, + "grad_norm": 3.3008804321289062, + "kl": 4.78125, + "learning_rate": 5.741224425593052e-07, + "loss": 0.2639, + "num_tokens": 928620431.0, + "reward": 1.0048828125, + "reward_std": 0.29044726490974426, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.1922282725572586, "step": 1567 }, { @@ -45458,27 +45458,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1989.0, - "completions/mean_length": 808.1171875, - "completions/mean_terminated_length": 762.9392700195312, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/max_terminated_length": 1793.0, + "completions/mean_length": 631.501953125, + "completions/mean_terminated_length": 628.7299194335938, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, "epoch": 0.5352906033967739, - "grad_norm": 1.329008936882019, - "kl": 5.921875, - "learning_rate": 5.733102589766068e-07, - "loss": 0.3459, - "num_tokens": 876348289.0, - "reward": 1.8056640625, - "reward_std": 0.6167274117469788, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.80078125, - "rewards/format_reward/std": 0.39980348944664, - "rewards/tag_count_reward/mean": 0.9130859375, - "rewards/tag_count_reward/std": 0.19349662959575653, + "grad_norm": 2.251715660095215, + "kl": 3.39453125, + "learning_rate": 5.735868853245934e-07, + "loss": 0.1711, + "num_tokens": 929018752.0, + "reward": 1.08203125, + "reward_std": 0.25112995505332947, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.15868334472179413, "step": 1568 }, { @@ -45487,27 +45487,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 885.88671875, - "completions/mean_terminated_length": 836.183349609375, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 1619.0, + "completions/mean_length": 632.630859375, + "completions/mean_terminated_length": 627.0804443359375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, "epoch": 0.5356319877101647, - "grad_norm": 1.266638159751892, - "kl": 5.7734375, - "learning_rate": 5.727748543518637e-07, - "loss": 0.3505, - "num_tokens": 876871111.0, - "reward": 1.81103515625, - "reward_std": 0.5950206518173218, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.20682698488235474, + "grad_norm": 3.9713985919952393, + "kl": 3.5625, + "learning_rate": 5.73051294587207e-07, + "loss": 0.219, + "num_tokens": 929411907.0, + "reward": 1.0361328125, + "reward_std": 0.27317631244659424, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.16808471083641052, "step": 1569 }, { @@ -45516,27 +45516,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 885.673828125, - "completions/mean_terminated_length": 845.755615234375, - "completions/min_length": 187.0, - "completions/min_terminated_length": 187.0, + "completions/max_terminated_length": 1326.0, + "completions/mean_length": 640.427734375, + "completions/mean_terminated_length": 637.6731567382812, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, "epoch": 0.5359733720235555, - "grad_norm": 1.0821884870529175, - "kl": 5.4140625, - "learning_rate": 5.722394174023805e-07, - "loss": 0.3236, - "num_tokens": 877401328.0, - "reward": 1.78466796875, - "reward_std": 0.5873010158538818, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.19800402224063873, + "grad_norm": 1.646174430847168, + "kl": 2.400390625, + "learning_rate": 5.725156711078961e-07, + "loss": 0.1073, + "num_tokens": 929816558.0, + "reward": 1.0400390625, + "reward_std": 0.24709290266036987, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.17521031200885773, "step": 1570 }, { @@ -45545,27 +45545,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 856.93359375, - "completions/mean_terminated_length": 813.534423828125, - "completions/min_length": 79.0, - "completions/min_terminated_length": 79.0, + "completions/max_terminated_length": 1695.0, + "completions/mean_length": 671.77734375, + "completions/mean_terminated_length": 660.94091796875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, "epoch": 0.5363147563369464, - "grad_norm": 1.3648273944854736, - "kl": 5.0078125, - "learning_rate": 5.717039488881118e-07, - "loss": 0.2963, - "num_tokens": 877919006.0, - "reward": 1.833984375, - "reward_std": 0.602922797203064, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.19604040682315826, + "grad_norm": 3.056997776031494, + "kl": 2.83984375, + "learning_rate": 5.71980015647457e-07, + "loss": 0.1345, + "num_tokens": 930239436.0, + "reward": 1.10791015625, + "reward_std": 0.32693377137184143, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.1689939647912979, "step": 1571 }, { @@ -45574,27 +45574,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1970.0, - "completions/mean_length": 790.580078125, - "completions/mean_terminated_length": 742.11962890625, - "completions/min_length": 90.0, - "completions/min_terminated_length": 90.0, + "completions/max_terminated_length": 1857.0, + "completions/mean_length": 618.6640625, + "completions/mean_terminated_length": 615.866943359375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, "epoch": 0.5366561406503371, - "grad_norm": 2.4586451053619385, - "kl": 5.109375, - "learning_rate": 5.711684495690573e-07, - "loss": 0.3362, - "num_tokens": 878399975.0, - "reward": 1.8408203125, - "reward_std": 0.5516421794891357, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.18873685598373413, + "grad_norm": 4.347153186798096, + "kl": 2.822265625, + "learning_rate": 5.714443289667318e-07, + "loss": 0.1825, + "num_tokens": 930632384.0, + "reward": 1.033203125, + "reward_std": 0.26346203684806824, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.18010744452476501, "step": 1572 }, { @@ -45603,27 +45603,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 891.001953125, - "completions/mean_terminated_length": 839.0550537109375, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_terminated_length": 1662.0, + "completions/mean_length": 700.412109375, + "completions/mean_terminated_length": 692.4695434570312, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.5369975249637279, - "grad_norm": 1.2200103998184204, - "kl": 4.88671875, - "learning_rate": 5.706329202052605e-07, - "loss": 0.3001, - "num_tokens": 878926104.0, - "reward": 1.7900390625, - "reward_std": 0.5616840124130249, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.814453125, - "rewards/format_reward/std": 0.38912075757980347, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.19204923510551453, + "grad_norm": 3.6437830924987793, + "kl": 1.94921875, + "learning_rate": 5.709086118266069e-07, + "loss": 0.1119, + "num_tokens": 931060931.0, + "reward": 1.01611328125, + "reward_std": 0.22769811749458313, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.94189453125, + "rewards/tag_count_reward/std": 0.1600138396024704, "step": 1573 }, { @@ -45632,27 +45632,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 920.75, - "completions/mean_terminated_length": 855.5371704101562, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 1870.0, + "completions/mean_length": 680.13671875, + "completions/mean_terminated_length": 672.07470703125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.5373389092771187, - "grad_norm": 2.1945228576660156, - "kl": 6.25390625, - "learning_rate": 5.700973615568072e-07, - "loss": 0.3679, - "num_tokens": 879475128.0, - "reward": 1.72412109375, - "reward_std": 0.6320927739143372, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.779296875, - "rewards/format_reward/std": 0.4151262938976288, - "rewards/tag_count_reward/mean": 0.89599609375, - "rewards/tag_count_reward/std": 0.21626292169094086, + "grad_norm": 1.8281835317611694, + "kl": 2.421875, + "learning_rate": 5.703728649880113e-07, + "loss": 0.1352, + "num_tokens": 931486761.0, + "reward": 1.0400390625, + "reward_std": 0.2658703923225403, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.9423828125, + "rewards/tag_count_reward/std": 0.15593473613262177, "step": 1574 }, { @@ -45661,27 +45661,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1978.0, - "completions/mean_length": 851.12890625, - "completions/mean_terminated_length": 819.9478759765625, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/max_terminated_length": 1710.0, + "completions/mean_length": 672.015625, + "completions/mean_terminated_length": 666.61962890625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, "epoch": 0.5376802935905095, - "grad_norm": 0.8738031983375549, - "kl": 4.16015625, - "learning_rate": 5.695617743838252e-07, - "loss": 0.2279, - "num_tokens": 879985882.0, - "reward": 1.81005859375, - "reward_std": 0.5117999315261841, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.93505859375, - "rewards/tag_count_reward/std": 0.16270823776721954, + "grad_norm": 2.4716744422912598, + "kl": 2.361328125, + "learning_rate": 5.698370892119171e-07, + "loss": 0.1364, + "num_tokens": 931905809.0, + "reward": 1.02880859375, + "reward_std": 0.2446683943271637, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.1736346036195755, "step": 1575 }, { @@ -45690,27 +45690,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 856.6484375, - "completions/mean_terminated_length": 813.2388916015625, - "completions/min_length": 59.0, - "completions/min_terminated_length": 59.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1673.0, + "completions/max_terminated_length": 1673.0, + "completions/mean_length": 657.873046875, + "completions/mean_terminated_length": 657.873046875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.5380216779039003, - "grad_norm": 1.9138249158859253, - "kl": 4.5625, - "learning_rate": 5.690261594464824e-07, - "loss": 0.2513, - "num_tokens": 880506678.0, - "reward": 1.74462890625, - "reward_std": 0.4995850920677185, - "rewards/accuracy_reward/mean": 0.013671875, - "rewards/accuracy_reward/std": 0.1162383034825325, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.19446396827697754, + "grad_norm": 3.619882345199585, + "kl": 2.109375, + "learning_rate": 5.693012852593369e-07, + "loss": 0.1756, + "num_tokens": 932324832.0, + "reward": 0.96728515625, + "reward_std": 0.2092210203409195, + "rewards/accuracy_reward/mean": 0.025390625, + "rewards/accuracy_reward/std": 0.15746226906776428, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.93408203125, + "rewards/tag_count_reward/std": 0.18353331089019775, "step": 1576 }, { @@ -45719,27 +45719,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1986.0, - "completions/mean_length": 842.876953125, - "completions/mean_terminated_length": 786.1942749023438, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 1892.0, + "completions/mean_length": 667.80078125, + "completions/mean_terminated_length": 665.0997924804688, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.5383630622172911, - "grad_norm": 1.5206162929534912, - "kl": 5.23828125, - "learning_rate": 5.68490517504986e-07, - "loss": 0.327, - "num_tokens": 881008359.0, - "reward": 1.80322265625, - "reward_std": 0.5785903930664062, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.814453125, - "rewards/format_reward/std": 0.38912075757980347, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.20618844032287598, + "grad_norm": 2.964077949523926, + "kl": 2.154296875, + "learning_rate": 5.687654538913238e-07, + "loss": 0.134, + "num_tokens": 932736874.0, + "reward": 1.08056640625, + "reward_std": 0.2898910343647003, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.93603515625, + "rewards/tag_count_reward/std": 0.171859011054039, "step": 1577 }, { @@ -45748,27 +45748,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 905.759765625, - "completions/mean_terminated_length": 868.9132690429688, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 1748.0, + "completions/mean_length": 673.638671875, + "completions/mean_terminated_length": 665.538330078125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, "epoch": 0.538704446530682, - "grad_norm": 1.0344030857086182, - "kl": 4.80078125, - "learning_rate": 5.67954849319582e-07, - "loss": 0.2771, - "num_tokens": 881549436.0, - "reward": 1.78466796875, - "reward_std": 0.539876401424408, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.19167259335517883, + "grad_norm": 4.497527122497559, + "kl": 2.55078125, + "learning_rate": 5.682295958689691e-07, + "loss": 0.1784, + "num_tokens": 933159105.0, + "reward": 1.0068359375, + "reward_std": 0.2176477611064911, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.1702537089586258, "step": 1578 }, { @@ -45777,27 +45777,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1861.0, - "completions/mean_length": 857.669921875, - "completions/mean_terminated_length": 829.10205078125, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 651.271484375, + "completions/mean_terminated_length": 645.7941284179688, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.5390458308440728, - "grad_norm": 3.1057684421539307, - "kl": 4.00390625, - "learning_rate": 5.674191556505533e-07, - "loss": 0.2752, - "num_tokens": 882069283.0, - "reward": 1.8603515625, - "reward_std": 0.5147795677185059, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.1606195569038391, + "grad_norm": 2.422194719314575, + "kl": 3.2578125, + "learning_rate": 5.676937119534027e-07, + "loss": 0.1685, + "num_tokens": 933573276.0, + "reward": 0.9892578125, + "reward_std": 0.25339266657829285, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.1872730702161789, "step": 1579 }, { @@ -45806,27 +45806,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 868.314453125, - "completions/mean_terminated_length": 825.3299560546875, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/max_terminated_length": 1862.0, + "completions/mean_length": 670.515625, + "completions/mean_terminated_length": 667.8199462890625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.5393872151574635, - "grad_norm": 1.5593984127044678, - "kl": 5.58203125, - "learning_rate": 5.668834372582195e-07, - "loss": 0.3297, - "num_tokens": 882602084.0, - "reward": 1.7783203125, - "reward_std": 0.5504990816116333, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.19395042955875397, + "grad_norm": 1.8832250833511353, + "kl": 3.607421875, + "learning_rate": 5.67157802905791e-07, + "loss": 0.2155, + "num_tokens": 934004804.0, + "reward": 0.99267578125, + "reward_std": 0.24724173545837402, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.1766246110200882, "step": 1580 }, { @@ -45835,27 +45835,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1879.0, - "completions/mean_length": 876.46875, - "completions/mean_terminated_length": 831.3184204101562, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1693.0, + "completions/max_terminated_length": 1693.0, + "completions/mean_length": 657.982421875, + "completions/mean_terminated_length": 657.982421875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.5397285994708543, - "grad_norm": 1.1250920295715332, - "kl": 5.21875, - "learning_rate": 5.663476949029342e-07, - "loss": 0.3066, - "num_tokens": 883135796.0, - "reward": 1.77001953125, - "reward_std": 0.5458778738975525, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.19894284009933472, + "grad_norm": 3.1322691440582275, + "kl": 3.453125, + "learning_rate": 5.666218694873359e-07, + "loss": 0.1815, + "num_tokens": 934426651.0, + "reward": 1.0498046875, + "reward_std": 0.29360878467559814, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.17023125290870667, "step": 1581 }, { @@ -45864,27 +45864,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 795.607421875, - "completions/mean_terminated_length": 768.1098022460938, - "completions/min_length": 81.0, - "completions/min_terminated_length": 81.0, + "completions/max_terminated_length": 1405.0, + "completions/mean_length": 614.40625, + "completions/mean_terminated_length": 608.7843627929688, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, "epoch": 0.5400699837842451, - "grad_norm": 1.7664976119995117, - "kl": 4.20703125, - "learning_rate": 5.65811929345086e-07, - "loss": 0.2241, - "num_tokens": 883626971.0, - "reward": 1.8818359375, - "reward_std": 0.5036077499389648, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.192208394408226, + "grad_norm": 2.205714702606201, + "kl": 2.830078125, + "learning_rate": 5.660859124592744e-07, + "loss": 0.1742, + "num_tokens": 934825051.0, + "reward": 1.0947265625, + "reward_std": 0.33375608921051025, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.17776581645011902, "step": 1582 }, { @@ -45893,27 +45893,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1958.0, - "completions/mean_length": 846.677734375, - "completions/mean_terminated_length": 805.4202270507812, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1512.0, + "completions/max_terminated_length": 1512.0, + "completions/mean_length": 625.884765625, + "completions/mean_terminated_length": 625.884765625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, "epoch": 0.5404113680976359, - "grad_norm": 1.2929701805114746, - "kl": 4.62890625, - "learning_rate": 5.652761413450965e-07, - "loss": 0.2653, - "num_tokens": 884126726.0, - "reward": 1.8408203125, - "reward_std": 0.4792023003101349, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.1666000783443451, + "grad_norm": 5.821282386779785, + "kl": 3.55859375, + "learning_rate": 5.655499325828763e-07, + "loss": 0.1766, + "num_tokens": 935211760.0, + "reward": 1.001953125, + "reward_std": 0.2657081186771393, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.17862646281719208, "step": 1583 }, { @@ -45922,27 +45922,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 820.994140625, - "completions/mean_terminated_length": 781.4132690429688, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1801.0, + "completions/max_terminated_length": 1801.0, + "completions/mean_length": 626.85546875, + "completions/mean_terminated_length": 626.85546875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, "epoch": 0.5407527524110267, - "grad_norm": 3.424382209777832, - "kl": 4.8671875, - "learning_rate": 5.647403316634181e-07, - "loss": 0.3484, - "num_tokens": 884621955.0, - "reward": 1.8779296875, - "reward_std": 0.48530155420303345, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.1672869324684143, + "grad_norm": 3.5651533603668213, + "kl": 3.34375, + "learning_rate": 5.650139306194448e-07, + "loss": 0.1696, + "num_tokens": 935607590.0, + "reward": 1.03564453125, + "reward_std": 0.2358933985233307, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.93994140625, + "rewards/tag_count_reward/std": 0.17041848599910736, "step": 1584 }, { @@ -45951,27 +45951,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 845.580078125, - "completions/mean_terminated_length": 809.2897338867188, - "completions/min_length": 187.0, - "completions/min_terminated_length": 187.0, + "completions/max_terminated_length": 1509.0, + "completions/mean_length": 589.11328125, + "completions/mean_terminated_length": 583.3922119140625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.5410941367244175, - "grad_norm": 0.9744515419006348, - "kl": 5.57421875, - "learning_rate": 5.64204501060535e-07, - "loss": 0.3258, - "num_tokens": 885142044.0, - "reward": 1.87158203125, - "reward_std": 0.5376770496368408, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.1841985285282135, + "grad_norm": 4.280489444732666, + "kl": 3.0, + "learning_rate": 5.644779073303136e-07, + "loss": 0.2015, + "num_tokens": 935996368.0, + "reward": 1.05859375, + "reward_std": 0.25045138597488403, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.947265625, + "rewards/tag_count_reward/std": 0.15132275223731995, "step": 1585 }, { @@ -45980,27 +45980,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1858.0, - "completions/mean_length": 810.689453125, - "completions/mean_terminated_length": 768.1959838867188, - "completions/min_length": 226.0, - "completions/min_terminated_length": 226.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1463.0, + "completions/max_terminated_length": 1463.0, + "completions/mean_length": 569.9296875, + "completions/mean_terminated_length": 569.9296875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, "epoch": 0.5414355210378083, - "grad_norm": 0.995830774307251, - "kl": 5.50390625, - "learning_rate": 5.636686502969606e-07, - "loss": 0.3226, - "num_tokens": 885636301.0, - "reward": 1.837890625, - "reward_std": 0.5120077133178711, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.18155530095100403, + "grad_norm": 4.351285934448242, + "kl": 3.173828125, + "learning_rate": 5.639418634768474e-07, + "loss": 0.1394, + "num_tokens": 936367356.0, + "reward": 1.0302734375, + "reward_std": 0.2381097376346588, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.9443359375, + "rewards/tag_count_reward/std": 0.162770614027977, "step": 1586 }, { @@ -46009,27 +46009,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1980.0, - "completions/mean_length": 823.3671875, - "completions/mean_terminated_length": 783.8628540039062, - "completions/min_length": 12.0, - "completions/min_terminated_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1689.0, + "completions/max_terminated_length": 1689.0, + "completions/mean_length": 596.37890625, + "completions/mean_terminated_length": 596.37890625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, "epoch": 0.5417769053511992, - "grad_norm": 1.4965887069702148, - "kl": 4.90234375, - "learning_rate": 5.631327801332373e-07, - "loss": 0.3153, - "num_tokens": 886141625.0, - "reward": 1.87939453125, - "reward_std": 0.5356603860855103, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.17327652871608734, + "grad_norm": 4.0819220542907715, + "kl": 2.69921875, + "learning_rate": 5.634057998204392e-07, + "loss": 0.1478, + "num_tokens": 936756462.0, + "reward": 1.0498046875, + "reward_std": 0.22694355249404907, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9443359375, + "rewards/tag_count_reward/std": 0.1693984717130661, "step": 1587 }, { @@ -46038,27 +46038,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 846.3046875, - "completions/mean_terminated_length": 787.2048950195312, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/max_terminated_length": 1833.0, + "completions/mean_length": 610.421875, + "completions/mean_terminated_length": 599.1023559570312, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.5421182896645899, - "grad_norm": 1.4262303113937378, - "kl": 6.828125, - "learning_rate": 5.625968913299344e-07, - "loss": 0.4179, - "num_tokens": 886654533.0, - "reward": 1.83349609375, - "reward_std": 0.5428067445755005, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.1952926218509674, + "grad_norm": 3.5612952709198, + "kl": 4.578125, + "learning_rate": 5.628697171225113e-07, + "loss": 0.275, + "num_tokens": 937148598.0, + "reward": 1.01904296875, + "reward_std": 0.2345481663942337, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.94091796875, + "rewards/tag_count_reward/std": 0.1678706705570221, "step": 1588 }, { @@ -46067,27 +46067,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 887.185546875, - "completions/mean_terminated_length": 847.3192138671875, - "completions/min_length": 176.0, - "completions/min_terminated_length": 176.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1729.0, + "completions/max_terminated_length": 1729.0, + "completions/mean_length": 628.037109375, + "completions/mean_terminated_length": 628.037109375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.5424596739779807, - "grad_norm": 1.3533570766448975, - "kl": 5.72265625, - "learning_rate": 5.620609846476486e-07, - "loss": 0.334, - "num_tokens": 887184228.0, - "reward": 1.83447265625, - "reward_std": 0.4780813157558441, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.18597961962223053, + "grad_norm": 3.1354148387908936, + "kl": 2.61328125, + "learning_rate": 5.623336161445123e-07, + "loss": 0.1498, + "num_tokens": 937545609.0, + "reward": 1.02685546875, + "reward_std": 0.2647894024848938, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.1686147004365921, "step": 1589 }, { @@ -46096,27 +46096,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1926.0, - "completions/mean_length": 902.568359375, - "completions/mean_terminated_length": 836.3037109375, - "completions/min_length": 25.0, - "completions/min_terminated_length": 25.0, + "completions/max_terminated_length": 1760.0, + "completions/mean_length": 671.560546875, + "completions/mean_terminated_length": 663.4479370117188, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.5428010582913715, - "grad_norm": 1.906775712966919, - "kl": 6.6953125, - "learning_rate": 5.615250608470009e-07, - "loss": 0.3857, - "num_tokens": 887721623.0, - "reward": 1.7998046875, - "reward_std": 0.6019806861877441, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.798828125, - "rewards/format_reward/std": 0.4012683033943176, - "rewards/tag_count_reward/mean": 0.9072265625, - "rewards/tag_count_reward/std": 0.21143096685409546, + "grad_norm": 2.8441548347473145, + "kl": 2.458984375, + "learning_rate": 5.617974976479163e-07, + "loss": 0.1586, + "num_tokens": 937964728.0, + "reward": 1.0673828125, + "reward_std": 0.24446213245391846, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9404296875, + "rewards/tag_count_reward/std": 0.15908929705619812, "step": 1590 }, { @@ -46125,27 +46125,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 839.203125, - "completions/mean_terminated_length": 800.2096557617188, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 616.31640625, + "completions/mean_terminated_length": 610.7020263671875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, "epoch": 0.5431424426047623, - "grad_norm": 1.6059449911117554, - "kl": 6.1484375, - "learning_rate": 5.609891206886373e-07, - "loss": 0.3718, - "num_tokens": 888228847.0, - "reward": 1.8369140625, - "reward_std": 0.5652753710746765, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.1939898282289505, + "grad_norm": 4.353187561035156, + "kl": 3.30859375, + "learning_rate": 5.612613623942238e-07, + "loss": 0.2417, + "num_tokens": 938357834.0, + "reward": 1.01708984375, + "reward_std": 0.22368595004081726, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.94677734375, + "rewards/tag_count_reward/std": 0.16396017372608185, "step": 1591 }, { @@ -46154,27 +46154,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 895.525390625, - "completions/mean_terminated_length": 843.7816162109375, - "completions/min_length": 62.0, - "completions/min_terminated_length": 62.0, + "completions/max_terminated_length": 1519.0, + "completions/mean_length": 655.404296875, + "completions/mean_terminated_length": 652.6790771484375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.5434838269181531, - "grad_norm": 2.739230155944824, - "kl": 7.375, - "learning_rate": 5.604531649332267e-07, - "loss": 0.4237, - "num_tokens": 888770572.0, - "reward": 1.74609375, - "reward_std": 0.6087614893913269, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.791015625, - "rewards/format_reward/std": 0.40698084235191345, - "rewards/tag_count_reward/mean": 0.8984375, - "rewards/tag_count_reward/std": 0.21258702874183655, + "grad_norm": 10.148595809936523, + "kl": 2.796875, + "learning_rate": 5.607252111449578e-07, + "loss": 0.206, + "num_tokens": 938776617.0, + "reward": 1.03076171875, + "reward_std": 0.29847821593284607, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.1981005072593689, "step": 1592 }, { @@ -46183,27 +46183,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 937.939453125, - "completions/mean_terminated_length": 880.954833984375, - "completions/min_length": 100.0, - "completions/min_terminated_length": 100.0, - "epoch": 0.5438252112315439, - "grad_norm": 1.3750349283218384, - "kl": 7.7890625, - "learning_rate": 5.599171943414605e-07, - "loss": 0.4701, - "num_tokens": 889328013.0, - "reward": 1.8046875, - "reward_std": 0.6562377214431763, - "rewards/accuracy_reward/mean": 0.11491935700178146, - "rewards/accuracy_reward/std": 0.3192465901374817, - "rewards/format_reward/mean": 0.796875, - "rewards/format_reward/std": 0.4027182459831238, - "rewards/tag_count_reward/mean": 0.896484375, - "rewards/tag_count_reward/std": 0.22124744951725006, + "completions/max_terminated_length": 1884.0, + "completions/mean_length": 644.275390625, + "completions/mean_terminated_length": 638.7706298828125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.5438252112315439, + "grad_norm": 5.837143898010254, + "kl": 2.388671875, + "learning_rate": 5.601890446616641e-07, + "loss": 0.1723, + "num_tokens": 939183702.0, + "reward": 1.05517578125, + "reward_std": 0.2509949803352356, + "rewards/accuracy_reward/mean": 0.11693548411130905, + "rewards/accuracy_reward/std": 0.3216678202152252, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.93994140625, + "rewards/tag_count_reward/std": 0.1696992814540863, "step": 1593 }, { @@ -46212,27 +46212,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 888.693359375, - "completions/mean_terminated_length": 829.1807250976562, - "completions/min_length": 40.0, - "completions/min_terminated_length": 40.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1744.0, + "completions/max_terminated_length": 1744.0, + "completions/mean_length": 636.806640625, + "completions/mean_terminated_length": 636.806640625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.5441665955449347, - "grad_norm": 1.3322317600250244, - "kl": 5.6015625, - "learning_rate": 5.593812096740507e-07, - "loss": 0.3468, - "num_tokens": 889863520.0, - "reward": 1.83154296875, - "reward_std": 0.5643051266670227, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.20408765971660614, + "grad_norm": 4.291261196136475, + "kl": 2.58984375, + "learning_rate": 5.596528637059109e-07, + "loss": 0.148, + "num_tokens": 939590243.0, + "reward": 1.044921875, + "reward_std": 0.21930867433547974, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.939453125, + "rewards/tag_count_reward/std": 0.16327762603759766, "step": 1594 }, { @@ -46241,27 +46241,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 879.61328125, - "completions/mean_terminated_length": 814.569091796875, - "completions/min_length": 53.0, - "completions/min_terminated_length": 53.0, + "completions/max_terminated_length": 1916.0, + "completions/mean_length": 642.603515625, + "completions/mean_terminated_length": 639.8532104492188, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, "epoch": 0.5445079798583256, - "grad_norm": 1.2477127313613892, - "kl": 6.1640625, - "learning_rate": 5.588452116917299e-07, - "loss": 0.4084, - "num_tokens": 890394282.0, - "reward": 1.787109375, - "reward_std": 0.5605219602584839, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.900390625, - "rewards/tag_count_reward/std": 0.22358457744121552, + "grad_norm": 3.9532883167266846, + "kl": 3.7109375, + "learning_rate": 5.591166690392863e-07, + "loss": 0.2039, + "num_tokens": 939999656.0, + "reward": 1.01416015625, + "reward_std": 0.2936437726020813, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.17593075335025787, "step": 1595 }, { @@ -46270,27 +46270,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1921.0, - "completions/mean_length": 807.458984375, - "completions/mean_terminated_length": 770.01806640625, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2040.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 598.783203125, + "completions/mean_terminated_length": 598.783203125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, "epoch": 0.5448493641717163, - "grad_norm": 2.605314254760742, - "kl": 4.890625, - "learning_rate": 5.583092011552487e-07, - "loss": 0.3089, - "num_tokens": 890893653.0, - "reward": 1.7900390625, - "reward_std": 0.5496975183486938, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.9130859375, - "rewards/tag_count_reward/std": 0.19663172960281372, + "grad_norm": 2.306550979614258, + "kl": 3.36328125, + "learning_rate": 5.585804614233981e-07, + "loss": 0.2171, + "num_tokens": 940392185.0, + "reward": 1.02978515625, + "reward_std": 0.2356104552745819, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.94580078125, + "rewards/tag_count_reward/std": 0.16438506543636322, "step": 1596 }, { @@ -46299,27 +46299,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 864.05078125, - "completions/mean_terminated_length": 800.7119140625, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1727.0, + "completions/max_terminated_length": 1727.0, + "completions/mean_length": 631.453125, + "completions/mean_terminated_length": 631.453125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.5451907484851071, - "grad_norm": 2.446843385696411, - "kl": 5.05859375, - "learning_rate": 5.577731788253762e-07, - "loss": 0.3345, - "num_tokens": 891413807.0, - "reward": 1.8291015625, - "reward_std": 0.5291173458099365, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.20217134058475494, + "grad_norm": 2.2762372493743896, + "kl": 3.6015625, + "learning_rate": 5.580442416198725e-07, + "loss": 0.238, + "num_tokens": 940793249.0, + "reward": 1.01904296875, + "reward_std": 0.27855026721954346, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.17281270027160645, "step": 1597 }, { @@ -46328,27 +46328,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 829.580078125, - "completions/mean_terminated_length": 782.6226806640625, - "completions/min_length": 174.0, - "completions/min_terminated_length": 174.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1770.0, + "completions/max_terminated_length": 1770.0, + "completions/mean_length": 563.125, + "completions/mean_terminated_length": 563.125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, "epoch": 0.5455321327984979, - "grad_norm": 2.638470411300659, - "kl": 6.1953125, - "learning_rate": 5.572371454628981e-07, - "loss": 0.4269, - "num_tokens": 891920488.0, - "reward": 1.7822265625, - "reward_std": 0.5422258377075195, - "rewards/accuracy_reward/mean": 0.0234375, - "rewards/accuracy_reward/std": 0.15143637359142303, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.9130859375, - "rewards/tag_count_reward/std": 0.20215243101119995, + "grad_norm": 1.781742811203003, + "kl": 2.734375, + "learning_rate": 5.575080103903531e-07, + "loss": 0.1486, + "num_tokens": 941163505.0, + "reward": 0.9833984375, + "reward_std": 0.20317822694778442, + "rewards/accuracy_reward/mean": 0.037109375, + "rewards/accuracy_reward/std": 0.18921469151973724, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.9443359375, + "rewards/tag_count_reward/std": 0.16050052642822266, "step": 1598 }, { @@ -46357,27 +46357,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 870.962890625, - "completions/mean_terminated_length": 792.4937744140625, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 1870.0, + "completions/mean_length": 618.916015625, + "completions/mean_terminated_length": 610.4931640625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, "epoch": 0.5458735171118887, - "grad_norm": 1.2676059007644653, - "kl": 7.953125, - "learning_rate": 5.567011018286159e-07, - "loss": 0.486, - "num_tokens": 892442341.0, - "reward": 1.80859375, - "reward_std": 0.6159893274307251, + "grad_norm": 3.427180051803589, + "kl": 4.1484375, + "learning_rate": 5.569717684964992e-07, + "loss": 0.2369, + "num_tokens": 941556310.0, + "reward": 1.05126953125, + "reward_std": 0.3021742105484009, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.89453125, - "rewards/tag_count_reward/std": 0.22471851110458374, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.17523416876792908, "step": 1599 }, { @@ -46386,27 +46386,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1970.0, - "completions/mean_length": 940.267578125, - "completions/mean_terminated_length": 876.183837890625, - "completions/min_length": 41.0, - "completions/min_terminated_length": 41.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1612.0, + "completions/max_terminated_length": 1612.0, + "completions/mean_length": 626.498046875, + "completions/mean_terminated_length": 626.498046875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.5462149014252795, - "grad_norm": 1.526657223701477, - "kl": 7.765625, - "learning_rate": 5.56165048683345e-07, - "loss": 0.4601, - "num_tokens": 892994782.0, - "reward": 1.76025390625, - "reward_std": 0.6191115379333496, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.89306640625, - "rewards/tag_count_reward/std": 0.22592659294605255, + "grad_norm": 1.9331140518188477, + "kl": 3.1015625, + "learning_rate": 5.564355166999862e-07, + "loss": 0.1831, + "num_tokens": 941948101.0, + "reward": 1.01513671875, + "reward_std": 0.25616538524627686, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.92919921875, + "rewards/tag_count_reward/std": 0.18304325640201569, "step": 1600 }, { @@ -46415,27 +46415,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 893.10546875, - "completions/mean_terminated_length": 831.3209838867188, - "completions/min_length": 87.0, - "completions/min_terminated_length": 87.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1891.0, + "completions/max_terminated_length": 1891.0, + "completions/mean_length": 617.056640625, + "completions/mean_terminated_length": 617.056640625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, "epoch": 0.5465562857386703, - "grad_norm": 2.4214370250701904, - "kl": 7.84375, - "learning_rate": 5.556289867879155e-07, - "loss": 0.4828, - "num_tokens": 893529188.0, - "reward": 1.77783203125, - "reward_std": 0.5602582693099976, - "rewards/accuracy_reward/mean": 0.04233871027827263, - "rewards/accuracy_reward/std": 0.2015640139579773, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.2166028767824173, + "grad_norm": 1.7783819437026978, + "kl": 4.33984375, + "learning_rate": 5.558992557625028e-07, + "loss": 0.2827, + "num_tokens": 942341170.0, + "reward": 0.97998046875, + "reward_std": 0.21821710467338562, + "rewards/accuracy_reward/mean": 0.04435483738780022, + "rewards/accuracy_reward/std": 0.2060900777578354, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.1783849000930786, "step": 1601 }, { @@ -46444,27 +46444,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 900.703125, - "completions/mean_terminated_length": 836.8330078125, - "completions/min_length": 99.0, - "completions/min_terminated_length": 99.0, + "completions/max_terminated_length": 1564.0, + "completions/mean_length": 605.259765625, + "completions/mean_terminated_length": 602.4364013671875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, "epoch": 0.5468976700520611, - "grad_norm": 1.773688554763794, - "kl": 7.90625, - "learning_rate": 5.550929169031685e-07, - "loss": 0.4677, - "num_tokens": 894065436.0, - "reward": 1.76171875, - "reward_std": 0.6085403561592102, - "rewards/accuracy_reward/mean": 0.052419353276491165, - "rewards/accuracy_reward/std": 0.22309619188308716, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.90234375, - "rewards/tag_count_reward/std": 0.2211524099111557, + "grad_norm": 1.6121258735656738, + "kl": 2.1640625, + "learning_rate": 5.553629864457507e-07, + "loss": 0.139, + "num_tokens": 942726151.0, + "reward": 1.0244140625, + "reward_std": 0.22393229603767395, + "rewards/accuracy_reward/mean": 0.06653226166963577, + "rewards/accuracy_reward/std": 0.2494617998600006, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.9580078125, + "rewards/tag_count_reward/std": 0.13434022665023804, "step": 1602 }, { @@ -46473,27 +46473,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 908.662109375, - "completions/mean_terminated_length": 837.7490234375, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 1547.0, + "completions/mean_length": 609.107421875, + "completions/mean_terminated_length": 603.4647216796875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, "epoch": 0.547239054365452, - "grad_norm": 1.2073875665664673, - "kl": 7.5625, - "learning_rate": 5.545568397899575e-07, - "loss": 0.4764, - "num_tokens": 894609439.0, - "reward": 1.75244140625, - "reward_std": 0.5862622261047363, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.90283203125, - "rewards/tag_count_reward/std": 0.22164389491081238, + "grad_norm": 4.072652816772461, + "kl": 3.55859375, + "learning_rate": 5.54826709511444e-07, + "loss": 0.2032, + "num_tokens": 943116782.0, + "reward": 1.03662109375, + "reward_std": 0.27624383568763733, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.94287109375, + "rewards/tag_count_reward/std": 0.16338804364204407, "step": 1603 }, { @@ -46502,27 +46502,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0703125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 881.427734375, - "completions/mean_terminated_length": 793.1996459960938, - "completions/min_length": 91.0, - "completions/min_terminated_length": 91.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 621.376953125, + "completions/mean_terminated_length": 618.5851440429688, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, "epoch": 0.5475804386788428, - "grad_norm": 1.180014729499817, - "kl": 8.765625, - "learning_rate": 5.540207562091459e-07, - "loss": 0.5739, - "num_tokens": 895143242.0, - "reward": 1.7490234375, - "reward_std": 0.611189603805542, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.791015625, - "rewards/format_reward/std": 0.40698084235191345, - "rewards/tag_count_reward/mean": 0.8916015625, - "rewards/tag_count_reward/std": 0.23401492834091187, + "grad_norm": 5.0337090492248535, + "kl": 3.515625, + "learning_rate": 5.542904257213072e-07, + "loss": 0.2045, + "num_tokens": 943517439.0, + "reward": 1.05712890625, + "reward_std": 0.262310266494751, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.16754020750522614, "step": 1604 }, { @@ -46531,27 +46531,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1982.0, - "completions/mean_length": 845.85546875, - "completions/mean_terminated_length": 807.0765991210938, - "completions/min_length": 229.0, - "completions/min_terminated_length": 229.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1600.0, + "completions/max_terminated_length": 1600.0, + "completions/mean_length": 582.892578125, + "completions/mean_terminated_length": 582.892578125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, "epoch": 0.5479218229922335, - "grad_norm": 3.5608949661254883, - "kl": 5.453125, - "learning_rate": 5.534846669216062e-07, - "loss": 0.3658, - "num_tokens": 895648160.0, - "reward": 1.837890625, - "reward_std": 0.5191460847854614, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.19130395352840424, + "grad_norm": 2.89163875579834, + "kl": 2.51953125, + "learning_rate": 5.537541358370747e-07, + "loss": 0.1341, + "num_tokens": 943887720.0, + "reward": 1.0263671875, + "reward_std": 0.20927375555038452, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9521484375, + "rewards/tag_count_reward/std": 0.15213829278945923, "step": 1605 }, { @@ -46560,27 +46560,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1976.0, - "completions/mean_length": 850.73828125, - "completions/mean_terminated_length": 814.6035766601562, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 1844.0, + "completions/mean_length": 593.525390625, + "completions/mean_terminated_length": 579.1814575195312, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.5482632073056243, - "grad_norm": 1.6136350631713867, - "kl": 6.1640625, - "learning_rate": 5.529485726882193e-07, - "loss": 0.3918, - "num_tokens": 896154426.0, - "reward": 1.84619140625, - "reward_std": 0.5813133716583252, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.19964765012264252, + "grad_norm": 2.9822580814361572, + "kl": 4.0625, + "learning_rate": 5.532178406204895e-07, + "loss": 0.2773, + "num_tokens": 944262293.0, + "reward": 1.1015625, + "reward_std": 0.2619627118110657, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.935546875, + "rewards/tag_count_reward/std": 0.1734480857849121, "step": 1606 }, { @@ -46589,27 +46589,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 849.88671875, - "completions/mean_terminated_length": 798.6436157226562, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/max_terminated_length": 1924.0, + "completions/mean_length": 626.947265625, + "completions/mean_terminated_length": 624.1663208007812, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, "epoch": 0.5486045916190151, - "grad_norm": 0.8744932413101196, - "kl": 7.328125, - "learning_rate": 5.524124742698728e-07, - "loss": 0.4574, - "num_tokens": 896660128.0, - "reward": 1.77197265625, - "reward_std": 0.5420060157775879, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.90869140625, - "rewards/tag_count_reward/std": 0.20887276530265808, + "grad_norm": 1.8393330574035645, + "kl": 2.400390625, + "learning_rate": 5.526815408333023e-07, + "loss": 0.1412, + "num_tokens": 944653850.0, + "reward": 0.990234375, + "reward_std": 0.17443135380744934, + "rewards/accuracy_reward/mean": 0.029296875, + "rewards/accuracy_reward/std": 0.16880230605602264, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.14200608432292938, "step": 1607 }, { @@ -46618,27 +46618,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1942.0, - "completions/mean_length": 869.92578125, - "completions/mean_terminated_length": 831.92333984375, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/max_terminated_length": 1556.0, + "completions/mean_length": 633.533203125, + "completions/mean_terminated_length": 630.76513671875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.5489459759324059, - "grad_norm": 1.3320586681365967, - "kl": 5.09765625, - "learning_rate": 5.518763724274602e-07, - "loss": 0.305, - "num_tokens": 897178330.0, - "reward": 1.83740234375, - "reward_std": 0.5293657779693604, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.17906923592090607, + "grad_norm": 3.6546730995178223, + "kl": 2.828125, + "learning_rate": 5.521452372372701e-07, + "loss": 0.1662, + "num_tokens": 945051019.0, + "reward": 1.02978515625, + "reward_std": 0.26207971572875977, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.93994140625, + "rewards/tag_count_reward/std": 0.1675231009721756, "step": 1608 }, { @@ -46647,27 +46647,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 916.75390625, - "completions/mean_terminated_length": 856.2345581054688, - "completions/min_length": 190.0, - "completions/min_terminated_length": 190.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 677.3203125, + "completions/mean_terminated_length": 669.24169921875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.5492873602457967, - "grad_norm": 0.7662155032157898, - "kl": 7.0859375, - "learning_rate": 5.513402679218801e-07, - "loss": 0.4624, - "num_tokens": 897725612.0, - "reward": 1.80908203125, - "reward_std": 0.5761713981628418, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.21145978569984436, + "grad_norm": 3.4890480041503906, + "kl": 2.48046875, + "learning_rate": 5.516089305941553e-07, + "loss": 0.1693, + "num_tokens": 945475711.0, + "reward": 1.04443359375, + "reward_std": 0.24629496037960052, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.94287109375, + "rewards/tag_count_reward/std": 0.1641349196434021, "step": 1609 }, { @@ -46676,27 +46676,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 915.40625, - "completions/mean_terminated_length": 869.3658447265625, - "completions/min_length": 117.0, - "completions/min_terminated_length": 117.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1539.0, + "completions/max_terminated_length": 1539.0, + "completions/mean_length": 681.67578125, + "completions/mean_terminated_length": 681.67578125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, "epoch": 0.5496287445591875, - "grad_norm": 1.8923580646514893, - "kl": 6.90625, - "learning_rate": 5.50804161514035e-07, - "loss": 0.4278, - "num_tokens": 898269036.0, - "reward": 1.818359375, - "reward_std": 0.5410223603248596, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.912109375, - "rewards/tag_count_reward/std": 0.20711380243301392, + "grad_norm": 2.4688189029693604, + "kl": 2.798828125, + "learning_rate": 5.510726216657251e-07, + "loss": 0.1415, + "num_tokens": 945899465.0, + "reward": 1.0234375, + "reward_std": 0.23604212701320648, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.15868334472179413, "step": 1610 }, { @@ -46705,27 +46705,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1973.0, - "completions/mean_length": 884.74609375, - "completions/mean_terminated_length": 830.03271484375, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/max_terminated_length": 1789.0, + "completions/mean_length": 678.822265625, + "completions/mean_terminated_length": 673.4530029296875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.5499701288725783, - "grad_norm": 0.9083179831504822, - "kl": 6.171875, - "learning_rate": 5.502680539648296e-07, - "loss": 0.3791, - "num_tokens": 898798426.0, - "reward": 1.794921875, - "reward_std": 0.4967016875743866, - "rewards/accuracy_reward/mean": 0.025390625, - "rewards/accuracy_reward/std": 0.15746226906776428, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.1948670893907547, + "grad_norm": 2.9937500953674316, + "kl": 3.5, + "learning_rate": 5.505363112137493e-07, + "loss": 0.2034, + "num_tokens": 946323422.0, + "reward": 0.98193359375, + "reward_std": 0.19657066464424133, + "rewards/accuracy_reward/mean": 0.02734375, + "rewards/accuracy_reward/std": 0.16324250400066376, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.95068359375, + "rewards/tag_count_reward/std": 0.14881962537765503, "step": 1611 }, { @@ -46734,27 +46734,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 807.29296875, - "completions/mean_terminated_length": 751.5877075195312, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/max_terminated_length": 1707.0, + "completions/mean_length": 618.55859375, + "completions/mean_terminated_length": 615.76123046875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.5503115131859692, - "grad_norm": 1.611915946006775, - "kl": 8.203125, - "learning_rate": 5.497319460351706e-07, - "loss": 0.521, - "num_tokens": 899285264.0, - "reward": 1.80908203125, - "reward_std": 0.6097793579101562, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.21608169376850128, + "grad_norm": 2.427215576171875, + "kl": 3.7734375, + "learning_rate": 5.5e-07, + "loss": 0.2059, + "num_tokens": 946713628.0, + "reward": 1.037109375, + "reward_std": 0.2587704658508301, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.17133069038391113, "step": 1612 }, { @@ -46763,27 +46763,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 869.275390625, - "completions/mean_terminated_length": 816.35302734375, - "completions/min_length": 84.0, - "completions/min_terminated_length": 84.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 649.265625, + "completions/mean_terminated_length": 643.7804565429688, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, "epoch": 0.5506528974993599, - "grad_norm": 1.7584127187728882, - "kl": 8.2578125, - "learning_rate": 5.491958384859652e-07, - "loss": 0.518, - "num_tokens": 899805005.0, - "reward": 1.80078125, - "reward_std": 0.5637913942337036, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.20910651981830597, + "grad_norm": 2.0092036724090576, + "kl": 3.306640625, + "learning_rate": 5.494636887862507e-07, + "loss": 0.1843, + "num_tokens": 947120724.0, + "reward": 1.033203125, + "reward_std": 0.20708541572093964, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.947265625, + "rewards/tag_count_reward/std": 0.1607295721769333, "step": 1613 }, { @@ -46792,27 +46792,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 876.923828125, - "completions/mean_terminated_length": 824.3448486328125, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 712.0390625, + "completions/mean_terminated_length": 709.4246826171875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, "epoch": 0.5509942818127507, - "grad_norm": 2.293674945831299, - "kl": 7.4453125, - "learning_rate": 5.486597320781199e-07, - "loss": 0.4283, - "num_tokens": 900330390.0, - "reward": 1.80810546875, - "reward_std": 0.5948246717453003, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.90771484375, - "rewards/tag_count_reward/std": 0.2054874747991562, + "grad_norm": 4.856078624725342, + "kl": 3.12890625, + "learning_rate": 5.489273783342749e-07, + "loss": 0.1278, + "num_tokens": 947561688.0, + "reward": 1.09326171875, + "reward_std": 0.2850106656551361, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.94677734375, + "rewards/tag_count_reward/std": 0.16321250796318054, "step": 1614 }, { @@ -46821,27 +46821,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.072265625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1878.0, - "completions/mean_length": 945.734375, - "completions/mean_terminated_length": 859.8736572265625, - "completions/min_length": 180.0, - "completions/min_terminated_length": 180.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 705.41796875, + "completions/mean_terminated_length": 700.1529541015625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.5513356661261415, - "grad_norm": 5.122795581817627, - "kl": 10.8125, - "learning_rate": 5.481236275725398e-07, - "loss": 0.6246, - "num_tokens": 900889022.0, - "reward": 1.654296875, - "reward_std": 0.6732208132743835, - "rewards/accuracy_reward/mean": 0.03629032149910927, - "rewards/accuracy_reward/std": 0.1872003972530365, - "rewards/format_reward/mean": 0.751953125, - "rewards/format_reward/std": 0.4323015511035919, - "rewards/tag_count_reward/mean": 0.8671875, - "rewards/tag_count_reward/std": 0.2496328055858612, + "grad_norm": 5.00110387802124, + "kl": 3.640625, + "learning_rate": 5.483910694058445e-07, + "loss": 0.2306, + "num_tokens": 947997278.0, + "reward": 0.98583984375, + "reward_std": 0.22858084738254547, + "rewards/accuracy_reward/mean": 0.04838709533214569, + "rewards/accuracy_reward/std": 0.21479946374893188, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.17631596326828003, "step": 1615 }, { @@ -46850,27 +46850,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.07421875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1953.0, - "completions/mean_length": 970.21875, - "completions/mean_terminated_length": 883.8143310546875, - "completions/min_length": 65.0, - "completions/min_terminated_length": 65.0, + "completions/max_terminated_length": 1712.0, + "completions/mean_length": 776.853515625, + "completions/mean_terminated_length": 771.86865234375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.5516770504395323, - "grad_norm": 6.714582920074463, - "kl": 9.9140625, - "learning_rate": 5.475875257301274e-07, - "loss": 0.5375, - "num_tokens": 901460302.0, - "reward": 1.6875, - "reward_std": 0.6485152244567871, - "rewards/accuracy_reward/mean": 0.06653226166963577, - "rewards/accuracy_reward/std": 0.24946178495883942, - "rewards/format_reward/mean": 0.75390625, - "rewards/format_reward/std": 0.4311550557613373, - "rewards/tag_count_reward/mean": 0.869140625, - "rewards/tag_count_reward/std": 0.2511516213417053, + "grad_norm": 2.439131498336792, + "kl": 2.576171875, + "learning_rate": 5.4785476276273e-07, + "loss": 0.1324, + "num_tokens": 948469555.0, + "reward": 1.02099609375, + "reward_std": 0.218895822763443, + "rewards/accuracy_reward/mean": 0.08870967477560043, + "rewards/accuracy_reward/std": 0.2846112847328186, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.17631596326828003, "step": 1616 }, { @@ -46879,27 +46879,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1884.0, - "completions/mean_length": 868.125, - "completions/mean_terminated_length": 805.0040893554688, - "completions/min_length": 76.0, - "completions/min_terminated_length": 76.0, + "completions/max_terminated_length": 1466.0, + "completions/mean_length": 709.802734375, + "completions/mean_terminated_length": 688.5615234375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, "epoch": 0.5520184347529231, - "grad_norm": 1.7493685483932495, - "kl": 6.3671875, - "learning_rate": 5.470514273117807e-07, - "loss": 0.3642, - "num_tokens": 901984286.0, - "reward": 1.732421875, - "reward_std": 0.5994226932525635, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.791015625, - "rewards/format_reward/std": 0.40698084235191345, - "rewards/tag_count_reward/mean": 0.8984375, - "rewards/tag_count_reward/std": 0.21430610120296478, + "grad_norm": 1.7288684844970703, + "kl": 4.1171875, + "learning_rate": 5.473184591666978e-07, + "loss": 0.2585, + "num_tokens": 948912478.0, + "reward": 0.9970703125, + "reward_std": 0.2309703379869461, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9404296875, + "rewards/tag_count_reward/std": 0.16363714635372162, "step": 1617 }, { @@ -46908,27 +46908,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1983.0, - "completions/mean_length": 786.9453125, - "completions/mean_terminated_length": 754.0922241210938, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/max_terminated_length": 1940.0, + "completions/mean_length": 665.990234375, + "completions/mean_terminated_length": 663.2857055664062, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.5523598190663139, - "grad_norm": 3.0826311111450195, - "kl": 4.93359375, - "learning_rate": 5.465153330783939e-07, - "loss": 0.3272, - "num_tokens": 902468178.0, - "reward": 1.8212890625, - "reward_std": 0.49712318181991577, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.18952499330043793, + "grad_norm": 3.8421480655670166, + "kl": 3.13671875, + "learning_rate": 5.467821593795105e-07, + "loss": 0.1676, + "num_tokens": 949334441.0, + "reward": 1.00732421875, + "reward_std": 0.2161152958869934, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.94677734375, + "rewards/tag_count_reward/std": 0.15710307657718658, "step": 1618 }, { @@ -46937,27 +46937,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1954.0, - "completions/mean_length": 926.4609375, - "completions/mean_terminated_length": 851.6917114257812, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/max_terminated_length": 1655.0, + "completions/mean_length": 758.25, + "completions/mean_terminated_length": 742.95654296875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.5527012033797047, - "grad_norm": 1.520570158958435, - "kl": 7.015625, - "learning_rate": 5.459792437908542e-07, - "loss": 0.4051, - "num_tokens": 903015198.0, - "reward": 1.72900390625, - "reward_std": 0.6388310194015503, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.763671875, - "rewards/format_reward/std": 0.42524150013923645, - "rewards/tag_count_reward/mean": 0.87939453125, - "rewards/tag_count_reward/std": 0.23560354113578796, + "grad_norm": 2.9555060863494873, + "kl": 3.32421875, + "learning_rate": 5.462458641629253e-07, + "loss": 0.1783, + "num_tokens": 949795337.0, + "reward": 1.03564453125, + "reward_std": 0.27600812911987305, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.92822265625, + "rewards/tag_count_reward/std": 0.1792825609445572, "step": 1619 }, { @@ -46966,27 +46966,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 866.72265625, - "completions/mean_terminated_length": 816.1996459960938, - "completions/min_length": 134.0, - "completions/min_terminated_length": 134.0, + "completions/max_terminated_length": 1561.0, + "completions/mean_length": 714.625, + "completions/mean_terminated_length": 704.1259765625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, "epoch": 0.5530425876930956, - "grad_norm": 1.5511140823364258, - "kl": 5.953125, - "learning_rate": 5.454431602100425e-07, - "loss": 0.4021, - "num_tokens": 903532928.0, - "reward": 1.75, - "reward_std": 0.5770016312599182, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.798828125, - "rewards/format_reward/std": 0.4012683033943176, - "rewards/tag_count_reward/mean": 0.896484375, - "rewards/tag_count_reward/std": 0.21508051455020905, + "grad_norm": 4.922884464263916, + "kl": 3.1328125, + "learning_rate": 5.457095742786929e-07, + "loss": 0.1903, + "num_tokens": 950235193.0, + "reward": 1.064453125, + "reward_std": 0.24485132098197937, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.15868334472179413, "step": 1620 }, { @@ -46995,27 +46995,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1964.0, - "completions/mean_length": 859.68359375, - "completions/mean_terminated_length": 831.1640625, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 727.765625, + "completions/mean_terminated_length": 722.5882568359375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, "epoch": 0.5533839720064863, - "grad_norm": 4.295713424682617, - "kl": 4.50390625, - "learning_rate": 5.449070830968316e-07, - "loss": 0.3205, - "num_tokens": 904046606.0, - "reward": 1.7939453125, - "reward_std": 0.5318611264228821, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.1909715086221695, + "grad_norm": 3.3051609992980957, + "kl": 3.1171875, + "learning_rate": 5.45173290488556e-07, + "loss": 0.1704, + "num_tokens": 950681329.0, + "reward": 1.01123046875, + "reward_std": 0.22558684647083282, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.94287109375, + "rewards/tag_count_reward/std": 0.1541435420513153, "step": 1621 }, { @@ -47024,27 +47024,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 860.353515625, - "completions/mean_terminated_length": 801.9446411132812, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/max_terminated_length": 1540.0, + "completions/mean_length": 700.935546875, + "completions/mean_terminated_length": 695.6529541015625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, "epoch": 0.5537253563198771, - "grad_norm": 1.6234852075576782, - "kl": 5.609375, - "learning_rate": 5.443710132120846e-07, - "loss": 0.3364, - "num_tokens": 904563795.0, - "reward": 1.79345703125, - "reward_std": 0.6144713163375854, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.89892578125, - "rewards/tag_count_reward/std": 0.22099627554416656, + "grad_norm": 1.9526646137237549, + "kl": 3.529296875, + "learning_rate": 5.446370135542494e-07, + "loss": 0.1913, + "num_tokens": 951116896.0, + "reward": 1.037109375, + "reward_std": 0.26890334486961365, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.19246920943260193, "step": 1622 }, { @@ -47053,27 +47053,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 910.89453125, - "completions/mean_terminated_length": 837.609130859375, - "completions/min_length": 24.0, - "completions/min_terminated_length": 24.0, + "completions/max_terminated_length": 1512.0, + "completions/mean_length": 707.09765625, + "completions/mean_terminated_length": 704.4735717773438, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.5540667406332679, - "grad_norm": 1.9639532566070557, - "kl": 6.671875, - "learning_rate": 5.438349513166549e-07, - "loss": 0.4267, - "num_tokens": 905106445.0, - "reward": 1.70458984375, - "reward_std": 0.66635662317276, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.24230584502220154, - "rewards/format_reward/mean": 0.767578125, - "rewards/format_reward/std": 0.42278963327407837, - "rewards/tag_count_reward/mean": 0.87646484375, - "rewards/tag_count_reward/std": 0.24380344152450562, + "grad_norm": 2.6690735816955566, + "kl": 2.529296875, + "learning_rate": 5.441007442374973e-07, + "loss": 0.1136, + "num_tokens": 951555202.0, + "reward": 1.080078125, + "reward_std": 0.2462228238582611, + "rewards/accuracy_reward/mean": 0.11895161122083664, + "rewards/accuracy_reward/std": 0.3240584135055542, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.16026519238948822, "step": 1623 }, { @@ -47082,27 +47082,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1860.0, - "completions/mean_length": 870.599609375, - "completions/mean_terminated_length": 794.71728515625, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 1776.0, + "completions/mean_length": 720.267578125, + "completions/mean_terminated_length": 707.173583984375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, "epoch": 0.5544081249466587, - "grad_norm": 1.7202931642532349, - "kl": 6.22265625, - "learning_rate": 5.432988981713842e-07, - "loss": 0.4141, - "num_tokens": 905624784.0, - "reward": 1.70654296875, - "reward_std": 0.6034641265869141, - "rewards/accuracy_reward/mean": 0.021484375, - "rewards/accuracy_reward/std": 0.14513419568538666, - "rewards/format_reward/mean": 0.787109375, - "rewards/format_reward/std": 0.409751296043396, - "rewards/tag_count_reward/mean": 0.89794921875, - "rewards/tag_count_reward/std": 0.21606400609016418, + "grad_norm": 4.445178985595703, + "kl": 3.09765625, + "learning_rate": 5.435644833000138e-07, + "loss": 0.188, + "num_tokens": 951996571.0, + "reward": 0.984375, + "reward_std": 0.19075095653533936, + "rewards/accuracy_reward/mean": 0.041015625, + "rewards/accuracy_reward/std": 0.19852031767368317, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.17411890625953674, "step": 1624 }, { @@ -47111,27 +47111,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.068359375, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1966.0, - "completions/mean_length": 850.359375, - "completions/mean_terminated_length": 762.482177734375, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_terminated_length": 1780.0, + "completions/mean_length": 712.953125, + "completions/mean_terminated_length": 702.44091796875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.5547495092600495, - "grad_norm": 1.852662205696106, - "kl": 8.84375, - "learning_rate": 5.42762854537102e-07, - "loss": 0.5272, - "num_tokens": 906131304.0, - "reward": 1.78466796875, - "reward_std": 0.6673040390014648, - "rewards/accuracy_reward/mean": 0.119140625, - "rewards/accuracy_reward/std": 0.32427072525024414, - "rewards/format_reward/mean": 0.783203125, - "rewards/format_reward/std": 0.4124660789966583, - "rewards/tag_count_reward/mean": 0.88232421875, - "rewards/tag_count_reward/std": 0.23811282217502594, + "grad_norm": 2.121750831604004, + "kl": 2.533203125, + "learning_rate": 5.430282315035007e-07, + "loss": 0.139, + "num_tokens": 952432739.0, + "reward": 1.09033203125, + "reward_std": 0.2788330018520355, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.166073739528656, "step": 1625 }, { @@ -47140,27 +47140,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.072265625, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 900.119140625, - "completions/mean_terminated_length": 810.7052001953125, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 1901.0, + "completions/mean_length": 731.849609375, + "completions/mean_terminated_length": 721.4862060546875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.5550908935734403, - "grad_norm": 4.000398635864258, - "kl": 9.5625, - "learning_rate": 5.422268211746239e-07, - "loss": 0.5557, - "num_tokens": 906667013.0, - "reward": 1.69775390625, - "reward_std": 0.6580989956855774, - "rewards/accuracy_reward/mean": 0.08266129344701767, - "rewards/accuracy_reward/std": 0.2756475806236267, - "rewards/format_reward/mean": 0.74609375, - "rewards/format_reward/std": 0.43567025661468506, - "rewards/tag_count_reward/mean": 0.87158203125, - "rewards/tag_count_reward/std": 0.24176867306232452, + "grad_norm": 3.8825016021728516, + "kl": 3.55859375, + "learning_rate": 5.42491989609647e-07, + "loss": 0.2577, + "num_tokens": 952882294.0, + "reward": 1.1015625, + "reward_std": 0.3052609860897064, + "rewards/accuracy_reward/mean": 0.16532258689403534, + "rewards/accuracy_reward/std": 0.37184643745422363, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.17476527392864227, "step": 1626 }, { @@ -47169,27 +47169,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.08984375, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 946.92578125, - "completions/mean_terminated_length": 838.2360229492188, - "completions/min_length": 2.0, - "completions/min_terminated_length": 2.0, + "completions/max_terminated_length": 1768.0, + "completions/mean_length": 745.51171875, + "completions/mean_terminated_length": 732.6666870117188, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.5554322778868311, - "grad_norm": 2.9227447509765625, - "kl": 9.234375, - "learning_rate": 5.416907988447514e-07, - "loss": 0.5736, - "num_tokens": 907233887.0, - "reward": 1.6416015625, - "reward_std": 0.6145917177200317, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.728515625, - "rewards/format_reward/std": 0.44516023993492126, - "rewards/tag_count_reward/mean": 0.8623046875, - "rewards/tag_count_reward/std": 0.25138550996780396, + "grad_norm": 6.176815509796143, + "kl": 2.40625, + "learning_rate": 5.419557583801274e-07, + "loss": 0.1573, + "num_tokens": 953346044.0, + "reward": 1.01416015625, + "reward_std": 0.24248671531677246, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.1793731451034546, "step": 1627 }, { @@ -47198,27 +47198,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1980.0, - "completions/mean_length": 846.671875, - "completions/mean_terminated_length": 795.291259765625, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 1937.0, + "completions/mean_length": 714.87109375, + "completions/mean_terminated_length": 709.6431884765625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.555773662200222, - "grad_norm": 2.2842178344726562, - "kl": 7.78125, - "learning_rate": 5.411547883082701e-07, - "loss": 0.4494, - "num_tokens": 907753655.0, - "reward": 1.73486328125, - "reward_std": 0.641516387462616, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.76953125, - "rewards/format_reward/std": 0.42154473066329956, - "rewards/tag_count_reward/mean": 0.88720703125, - "rewards/tag_count_reward/std": 0.22953926026821136, + "grad_norm": 4.175749778747559, + "kl": 2.8125, + "learning_rate": 5.41419538576602e-07, + "loss": 0.1744, + "num_tokens": 953798330.0, + "reward": 1.072265625, + "reward_std": 0.2861534357070923, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.935546875, + "rewards/tag_count_reward/std": 0.1783159226179123, "step": 1628 }, { @@ -47227,27 +47227,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 849.826171875, - "completions/mean_terminated_length": 769.9479370117188, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1708.0, + "completions/max_terminated_length": 1708.0, + "completions/mean_length": 678.978515625, + "completions/mean_terminated_length": 678.978515625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.5561150465136127, - "grad_norm": 2.6979892253875732, - "kl": 7.3671875, - "learning_rate": 5.406187903259491e-07, - "loss": 0.4321, - "num_tokens": 908262782.0, - "reward": 1.73681640625, - "reward_std": 0.6400578022003174, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.7734375, - "rewards/format_reward/std": 0.4190165400505066, - "rewards/tag_count_reward/mean": 0.87548828125, - "rewards/tag_count_reward/std": 0.24630290269851685, + "grad_norm": 1.7773770093917847, + "kl": 2.26171875, + "learning_rate": 5.408833309607137e-07, + "loss": 0.1096, + "num_tokens": 954219983.0, + "reward": 1.10595703125, + "reward_std": 0.27551087737083435, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.95166015625, + "rewards/tag_count_reward/std": 0.14665940403938293, "step": 1629 }, { @@ -47256,27 +47256,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 811.080078125, - "completions/mean_terminated_length": 758.17724609375, - "completions/min_length": 123.0, - "completions/min_terminated_length": 123.0, + "completions/max_terminated_length": 1474.0, + "completions/mean_length": 677.689453125, + "completions/mean_terminated_length": 672.3157348632812, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.5564564308270035, - "grad_norm": 4.622169017791748, - "kl": 5.68359375, - "learning_rate": 5.400828056585394e-07, - "loss": 0.4133, - "num_tokens": 908749751.0, - "reward": 1.83154296875, - "reward_std": 0.5589828491210938, - "rewards/accuracy_reward/mean": 0.07056451588869095, - "rewards/accuracy_reward/std": 0.25635457038879395, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.19761274755001068, + "grad_norm": 2.833087921142578, + "kl": 3.7265625, + "learning_rate": 5.403471362940891e-07, + "loss": 0.1979, + "num_tokens": 954638656.0, + "reward": 1.0478515625, + "reward_std": 0.2628737688064575, + "rewards/accuracy_reward/mean": 0.11491935700178146, + "rewards/accuracy_reward/std": 0.3192465901374817, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.1812576949596405, "step": 1630 }, { @@ -47285,27 +47285,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1956.0, - "completions/mean_length": 937.03125, - "completions/mean_terminated_length": 880.0000610351562, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 1783.0, + "completions/mean_length": 768.42578125, + "completions/mean_terminated_length": 765.9216918945312, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.5567978151403943, - "grad_norm": 3.613112211227417, - "kl": 6.390625, - "learning_rate": 5.395468350667732e-07, - "loss": 0.4359, - "num_tokens": 909317703.0, - "reward": 1.712890625, - "reward_std": 0.5855178833007812, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.791015625, - "rewards/format_reward/std": 0.40698084235191345, - "rewards/tag_count_reward/mean": 0.892578125, - "rewards/tag_count_reward/std": 0.22542326152324677, + "grad_norm": 2.5279579162597656, + "kl": 3.19140625, + "learning_rate": 5.398109553383359e-07, + "loss": 0.1445, + "num_tokens": 955120282.0, + "reward": 0.9912109375, + "reward_std": 0.23996403813362122, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.1743793785572052, "step": 1631 }, { @@ -47314,27 +47314,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 870.28515625, - "completions/mean_terminated_length": 812.3646850585938, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1864.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 704.80859375, + "completions/mean_terminated_length": 704.80859375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, "epoch": 0.5571391994537851, - "grad_norm": 2.058533191680908, - "kl": 5.6484375, - "learning_rate": 5.390108793113628e-07, - "loss": 0.3515, - "num_tokens": 909843545.0, - "reward": 1.8017578125, - "reward_std": 0.5373117327690125, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.2055833637714386, + "grad_norm": 3.8299880027770996, + "kl": 2.84375, + "learning_rate": 5.392747888550423e-07, + "loss": 0.1718, + "num_tokens": 955561400.0, + "reward": 1.00830078125, + "reward_std": 0.2315167635679245, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.94384765625, + "rewards/tag_count_reward/std": 0.16147024929523468, "step": 1632 }, { @@ -47343,27 +47343,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 847.859375, - "completions/mean_terminated_length": 796.529541015625, - "completions/min_length": 239.0, - "completions/min_terminated_length": 239.0, + "completions/max_terminated_length": 1598.0, + "completions/mean_length": 698.076171875, + "completions/mean_terminated_length": 695.4344482421875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, "epoch": 0.5574805837671759, - "grad_norm": 4.070928573608398, - "kl": 4.33984375, - "learning_rate": 5.384749391529991e-07, - "loss": 0.2863, - "num_tokens": 910348753.0, - "reward": 1.833984375, - "reward_std": 0.48867836594581604, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.1907537281513214, + "grad_norm": 3.1295650005340576, + "kl": 2.958984375, + "learning_rate": 5.387386376057759e-07, + "loss": 0.1481, + "num_tokens": 955989919.0, + "reward": 1.03564453125, + "reward_std": 0.2552622854709625, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.93994140625, + "rewards/tag_count_reward/std": 0.1696992814540863, "step": 1633 }, { @@ -47372,27 +47372,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1959.0, - "completions/mean_length": 812.2734375, - "completions/mean_terminated_length": 743.48046875, - "completions/min_length": 93.0, - "completions/min_terminated_length": 93.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1809.0, + "completions/max_terminated_length": 1809.0, + "completions/mean_length": 646.27734375, + "completions/mean_terminated_length": 646.27734375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, "epoch": 0.5578219680805667, - "grad_norm": 1.5464138984680176, - "kl": 5.5234375, - "learning_rate": 5.379390153523515e-07, - "loss": 0.3656, - "num_tokens": 910844829.0, - "reward": 1.86328125, - "reward_std": 0.5772223472595215, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.20715993642807007, + "grad_norm": 1.8711708784103394, + "kl": 2.4921875, + "learning_rate": 5.382025023520835e-07, + "loss": 0.1455, + "num_tokens": 956401005.0, + "reward": 1.12255859375, + "reward_std": 0.2865196466445923, + "rewards/accuracy_reward/mean": 0.173828125, + "rewards/accuracy_reward/std": 0.3793322443962097, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.94677734375, + "rewards/tag_count_reward/std": 0.15787968039512634, "step": 1634 }, { @@ -47401,27 +47401,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 811.873046875, - "completions/mean_terminated_length": 792.2520141601562, - "completions/min_length": 193.0, - "completions/min_terminated_length": 193.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1637.0, + "completions/max_terminated_length": 1637.0, + "completions/mean_length": 710.873046875, + "completions/mean_terminated_length": 710.873046875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.5581633523939575, - "grad_norm": 2.051917552947998, - "kl": 3.96484375, - "learning_rate": 5.374031086700654e-07, - "loss": 0.2247, - "num_tokens": 911337500.0, - "reward": 1.8916015625, - "reward_std": 0.4719935953617096, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.9443359375, - "rewards/tag_count_reward/std": 0.16794821619987488, + "grad_norm": 2.4275922775268555, + "kl": 2.701171875, + "learning_rate": 5.376663838554878e-07, + "loss": 0.144, + "num_tokens": 956841964.0, + "reward": 1.0166015625, + "reward_std": 0.2830686569213867, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.17378656566143036, "step": 1635 }, { @@ -47430,27 +47430,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1928.0, - "completions/mean_length": 869.28515625, - "completions/mean_terminated_length": 828.8040771484375, - "completions/min_length": 208.0, - "completions/min_terminated_length": 208.0, + "completions/max_terminated_length": 1872.0, + "completions/mean_length": 717.259765625, + "completions/mean_terminated_length": 712.0411987304688, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.5585047367073483, - "grad_norm": 1.0205174684524536, - "kl": 4.921875, - "learning_rate": 5.368672198667627e-07, - "loss": 0.2909, - "num_tokens": 911863870.0, - "reward": 1.84375, - "reward_std": 0.5353987812995911, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.2021169811487198, + "grad_norm": 3.242382526397705, + "kl": 3.2734375, + "learning_rate": 5.371302828774886e-07, + "loss": 0.1992, + "num_tokens": 957290497.0, + "reward": 1.00341796875, + "reward_std": 0.2538776397705078, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.92724609375, + "rewards/tag_count_reward/std": 0.18294404447078705, "step": 1636 }, { @@ -47459,27 +47459,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 855.365234375, - "completions/mean_terminated_length": 814.4060668945312, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1609.0, + "completions/max_terminated_length": 1609.0, + "completions/mean_length": 755.041015625, + "completions/mean_terminated_length": 755.041015625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.558846121020739, - "grad_norm": 1.695725679397583, - "kl": 5.53125, - "learning_rate": 5.363313497030395e-07, - "loss": 0.3405, - "num_tokens": 912388569.0, - "reward": 1.8056640625, - "reward_std": 0.5462358593940735, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.1939898282289505, + "grad_norm": 2.9401183128356934, + "kl": 2.6875, + "learning_rate": 5.365942001795606e-07, + "loss": 0.144, + "num_tokens": 957763830.0, + "reward": 1.013671875, + "reward_std": 0.23237158358097076, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.16772332787513733, "step": 1637 }, { @@ -47488,27 +47488,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1886.0, - "completions/mean_length": 819.974609375, - "completions/mean_terminated_length": 780.36083984375, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/max_terminated_length": 1787.0, + "completions/mean_length": 725.423828125, + "completions/mean_terminated_length": 712.3806762695312, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.5591875053341299, - "grad_norm": 1.639160394668579, - "kl": 5.8046875, - "learning_rate": 5.357954989394651e-07, - "loss": 0.3393, - "num_tokens": 912889676.0, - "reward": 1.833984375, - "reward_std": 0.4923778772354126, - "rewards/accuracy_reward/mean": 0.04233871027827263, - "rewards/accuracy_reward/std": 0.2015640139579773, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.18036192655563354, + "grad_norm": 1.9118342399597168, + "kl": 4.64453125, + "learning_rate": 5.360581365231528e-07, + "loss": 0.3103, + "num_tokens": 958216527.0, + "reward": 0.9951171875, + "reward_std": 0.25297701358795166, + "rewards/accuracy_reward/mean": 0.08266129344701767, + "rewards/accuracy_reward/std": 0.2756475806236267, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9111328125, + "rewards/tag_count_reward/std": 0.2019064873456955, "step": 1638 }, { @@ -47517,27 +47517,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 792.453125, - "completions/mean_terminated_length": 733.3987426757812, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1772.0, + "completions/max_terminated_length": 1772.0, + "completions/mean_length": 625.45703125, + "completions/mean_terminated_length": 625.45703125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, "epoch": 0.5595288896475207, - "grad_norm": 4.835798740386963, - "kl": 8.6484375, - "learning_rate": 5.35259668336582e-07, - "loss": 0.4843, - "num_tokens": 913363940.0, - "reward": 1.8251953125, - "reward_std": 0.6062630414962769, - "rewards/accuracy_reward/mean": 0.1088709682226181, - "rewards/accuracy_reward/std": 0.31179171800613403, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.9091796875, - "rewards/tag_count_reward/std": 0.20761838555335999, + "grad_norm": 4.299546718597412, + "kl": 3.34375, + "learning_rate": 5.355220926696863e-07, + "loss": 0.1656, + "num_tokens": 958605289.0, + "reward": 1.0869140625, + "reward_std": 0.32027533650398254, + "rewards/accuracy_reward/mean": 0.1391129046678543, + "rewards/accuracy_reward/std": 0.3464137017726898, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.17729215323925018, "step": 1639 }, { @@ -47546,27 +47546,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1971.0, - "completions/mean_length": 893.580078125, - "completions/mean_terminated_length": 839.2821655273438, - "completions/min_length": 65.0, - "completions/min_terminated_length": 65.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1503.0, + "completions/max_terminated_length": 1503.0, + "completions/mean_length": 733.197265625, + "completions/mean_terminated_length": 733.197265625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, "epoch": 0.5598702739609115, - "grad_norm": 2.5028481483459473, - "kl": 7.515625, - "learning_rate": 5.347238586549036e-07, - "loss": 0.4639, - "num_tokens": 913895389.0, - "reward": 1.7939453125, - "reward_std": 0.533664345741272, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.20379072427749634, + "grad_norm": 2.5128886699676514, + "kl": 2.6484375, + "learning_rate": 5.349860693805552e-07, + "loss": 0.144, + "num_tokens": 959054622.0, + "reward": 1.01806640625, + "reward_std": 0.2745065689086914, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.16533562541007996, "step": 1640 }, { @@ -47575,27 +47575,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 862.306640625, - "completions/mean_terminated_length": 824.0584716796875, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1891.0, + "completions/max_terminated_length": 1891.0, + "completions/mean_length": 725.36328125, + "completions/mean_terminated_length": 725.36328125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, "epoch": 0.5602116582743023, - "grad_norm": 2.6302478313446045, - "kl": 7.7109375, - "learning_rate": 5.341880706549138e-07, - "loss": 0.4584, - "num_tokens": 914415514.0, - "reward": 1.85498046875, - "reward_std": 0.5276713967323303, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.2014581561088562, + "grad_norm": 2.316086530685425, + "kl": 2.5546875, + "learning_rate": 5.344500674171237e-07, + "loss": 0.1181, + "num_tokens": 959504632.0, + "reward": 1.0673828125, + "reward_std": 0.30224400758743286, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.16953378915786743, "step": 1641 }, { @@ -47604,27 +47604,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 892.8046875, - "completions/mean_terminated_length": 815.7916870117188, - "completions/min_length": 167.0, - "completions/min_terminated_length": 167.0, + "completions/max_terminated_length": 1787.0, + "completions/mean_length": 732.130859375, + "completions/mean_terminated_length": 724.375244140625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.5605530425876931, - "grad_norm": 2.9713118076324463, - "kl": 10.234375, - "learning_rate": 5.336523050970657e-07, - "loss": 0.6381, - "num_tokens": 914950086.0, - "reward": 1.7939453125, - "reward_std": 0.6171619892120361, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.8994140625, - "rewards/tag_count_reward/std": 0.23385153710842133, + "grad_norm": 2.683919906616211, + "kl": 3.421875, + "learning_rate": 5.339140875407257e-07, + "loss": 0.2086, + "num_tokens": 959956939.0, + "reward": 1.03955078125, + "reward_std": 0.25402867794036865, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.18388700485229492, "step": 1642 }, { @@ -47633,27 +47633,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 819.419921875, - "completions/mean_terminated_length": 764.2591552734375, - "completions/min_length": 85.0, - "completions/min_terminated_length": 85.0, + "completions/max_terminated_length": 1428.0, + "completions/mean_length": 684.48828125, + "completions/mean_terminated_length": 681.8199462890625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.5608944269010839, - "grad_norm": 1.6978282928466797, - "kl": 7.5625, - "learning_rate": 5.331165627417807e-07, - "loss": 0.4526, - "num_tokens": 915449837.0, - "reward": 1.82568359375, - "reward_std": 0.5862592458724976, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.21346116065979004, + "grad_norm": 1.6999034881591797, + "kl": 2.9921875, + "learning_rate": 5.33378130512664e-07, + "loss": 0.1461, + "num_tokens": 960387605.0, + "reward": 1.0302734375, + "reward_std": 0.2597278654575348, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.17696848511695862, "step": 1643 }, { @@ -47662,27 +47662,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 912.802734375, - "completions/mean_terminated_length": 847.130126953125, - "completions/min_length": 18.0, - "completions/min_terminated_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1784.0, + "completions/max_terminated_length": 1784.0, + "completions/mean_length": 734.4375, + "completions/mean_terminated_length": 734.4375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.5612358112144747, - "grad_norm": 2.8521158695220947, - "kl": 8.171875, - "learning_rate": 5.325808443494467e-07, - "loss": 0.471, - "num_tokens": 915999864.0, - "reward": 1.7509765625, - "reward_std": 0.5721356272697449, - "rewards/accuracy_reward/mean": 0.01953125, - "rewards/accuracy_reward/std": 0.1385180652141571, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.9072265625, - "rewards/tag_count_reward/std": 0.21826240420341492, + "grad_norm": 4.630004405975342, + "kl": 3.47265625, + "learning_rate": 5.328421970942091e-07, + "loss": 0.1936, + "num_tokens": 960846309.0, + "reward": 0.98193359375, + "reward_std": 0.2521049678325653, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.17793437838554382, "step": 1644 }, { @@ -47691,27 +47691,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1944.0, - "completions/mean_length": 808.5234375, - "completions/mean_terminated_length": 758.13818359375, - "completions/min_length": 182.0, - "completions/min_terminated_length": 182.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 675.552734375, + "completions/mean_terminated_length": 672.866943359375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.5615771955278654, - "grad_norm": 1.2435574531555176, - "kl": 7.296875, - "learning_rate": 5.32045150680418e-07, - "loss": 0.4752, - "num_tokens": 916498244.0, - "reward": 1.880859375, - "reward_std": 0.6028008460998535, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.2011692225933075, + "grad_norm": 5.781026840209961, + "kl": 3.6875, + "learning_rate": 5.323062880465972e-07, + "loss": 0.1936, + "num_tokens": 961276608.0, + "reward": 1.076171875, + "reward_std": 0.3023701608181, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.17999069392681122, "step": 1645 }, { @@ -47720,27 +47720,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 890.66796875, - "completions/mean_terminated_length": 833.7499389648438, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 1716.0, + "completions/mean_length": 700.80859375, + "completions/mean_terminated_length": 690.2008056640625, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.5619185798412563, - "grad_norm": 1.9105225801467896, - "kl": 7.1484375, - "learning_rate": 5.31509482495014e-07, - "loss": 0.4276, - "num_tokens": 917024906.0, - "reward": 1.7724609375, - "reward_std": 0.5939950942993164, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.814453125, - "rewards/format_reward/std": 0.38912075757980347, - "rewards/tag_count_reward/mean": 0.8974609375, - "rewards/tag_count_reward/std": 0.2211502492427826, + "grad_norm": 2.192715644836426, + "kl": 3.828125, + "learning_rate": 5.31770404131031e-07, + "loss": 0.2394, + "num_tokens": 961706062.0, + "reward": 0.970703125, + "reward_std": 0.26937031745910645, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.900390625, + "rewards/tag_count_reward/std": 0.20829153060913086, "step": 1646 }, { @@ -47749,27 +47749,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 933.173828125, - "completions/mean_terminated_length": 873.5328979492188, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 1717.0, + "completions/mean_length": 747.90625, + "completions/mean_terminated_length": 740.24365234375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.5622599641546471, - "grad_norm": 0.8320483565330505, - "kl": 5.8984375, - "learning_rate": 5.309738405535177e-07, - "loss": 0.3445, - "num_tokens": 917575187.0, - "reward": 1.80322265625, - "reward_std": 0.6222258806228638, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.8046875, - "rewards/format_reward/std": 0.3968288004398346, - "rewards/tag_count_reward/mean": 0.90087890625, - "rewards/tag_count_reward/std": 0.22407497465610504, + "grad_norm": 2.9099481105804443, + "kl": 3.48046875, + "learning_rate": 5.312345461086763e-07, + "loss": 0.2057, + "num_tokens": 962161486.0, + "reward": 1.01708984375, + "reward_std": 0.32222145795822144, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.90380859375, + "rewards/tag_count_reward/std": 0.20428422093391418, "step": 1647 }, { @@ -47778,27 +47778,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 890.234375, - "completions/mean_terminated_length": 843.1707153320312, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1797.0, + "completions/mean_length": 721.6796875, + "completions/mean_terminated_length": 711.2362060546875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.5626013484680379, - "grad_norm": 1.7899667024612427, - "kl": 4.73828125, - "learning_rate": 5.304382256161746e-07, - "loss": 0.3263, - "num_tokens": 918109163.0, - "reward": 1.84423828125, - "reward_std": 0.5855379700660706, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.90869140625, - "rewards/tag_count_reward/std": 0.2059241086244583, + "grad_norm": 2.831590175628662, + "kl": 4.02734375, + "learning_rate": 5.306987147406629e-07, + "loss": 0.2246, + "num_tokens": 962609162.0, + "reward": 1.0859375, + "reward_std": 0.3347628712654114, + "rewards/accuracy_reward/mean": 0.169921875, + "rewards/accuracy_reward/std": 0.3759314715862274, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.2103821337223053, "step": 1648 }, { @@ -47807,27 +47807,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1932.0, - "completions/mean_length": 914.560546875, - "completions/mean_terminated_length": 863.67138671875, - "completions/min_length": 247.0, - "completions/min_terminated_length": 247.0, + "completions/max_terminated_length": 1799.0, + "completions/mean_length": 739.5234375, + "completions/mean_terminated_length": 734.3922119140625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.5629427327814287, - "grad_norm": 4.089898109436035, - "kl": 5.7265625, - "learning_rate": 5.299026384431929e-07, - "loss": 0.4065, - "num_tokens": 918650762.0, - "reward": 1.78173828125, - "reward_std": 0.5551834106445312, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.2085477113723755, + "grad_norm": 1.9423184394836426, + "kl": 3.03125, + "learning_rate": 5.301629107880827e-07, + "loss": 0.1773, + "num_tokens": 963061142.0, + "reward": 1.01806640625, + "reward_std": 0.2861826419830322, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.91259765625, + "rewards/tag_count_reward/std": 0.19232456386089325, "step": 1649 }, { @@ -47836,27 +47836,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1854.0, - "completions/mean_length": 801.73046875, - "completions/mean_terminated_length": 764.11669921875, - "completions/min_length": 52.0, - "completions/min_terminated_length": 52.0, + "completions/max_terminated_length": 1902.0, + "completions/mean_length": 681.060546875, + "completions/mean_terminated_length": 678.385498046875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.5632841170948195, - "grad_norm": 3.8572473526000977, - "kl": 4.4921875, - "learning_rate": 5.293670797947396e-07, - "loss": 0.3147, - "num_tokens": 919134496.0, - "reward": 1.84912109375, - "reward_std": 0.549421489238739, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.19436074793338776, + "grad_norm": 1.760202169418335, + "kl": 3.46484375, + "learning_rate": 5.296271350119887e-07, + "loss": 0.2037, + "num_tokens": 963483093.0, + "reward": 1.03369140625, + "reward_std": 0.2839530110359192, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.18705546855926514, "step": 1650 }, { @@ -47865,27 +47865,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 832.83203125, - "completions/mean_terminated_length": 801.1743774414062, - "completions/min_length": 192.0, - "completions/min_terminated_length": 192.0, + "completions/max_terminated_length": 1678.0, + "completions/mean_length": 692.193359375, + "completions/mean_terminated_length": 681.5177001953125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, "epoch": 0.5636255014082103, - "grad_norm": 3.1188771724700928, - "kl": 3.615234375, - "learning_rate": 5.288315504309429e-07, - "loss": 0.2475, - "num_tokens": 919630778.0, - "reward": 1.88134765625, - "reward_std": 0.5292597413063049, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.17743425071239471, + "grad_norm": 2.400151014328003, + "kl": 4.48828125, + "learning_rate": 5.290913881733931e-07, + "loss": 0.2607, + "num_tokens": 963907368.0, + "reward": 1.02734375, + "reward_std": 0.3126910328865051, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.900390625, + "rewards/tag_count_reward/std": 0.2129373401403427, "step": 1651 }, { @@ -47894,27 +47894,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 829.96875, - "completions/mean_terminated_length": 803.2255249023438, - "completions/min_length": 210.0, - "completions/min_terminated_length": 210.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1810.0, + "completions/max_terminated_length": 1810.0, + "completions/mean_length": 727.75, + "completions/mean_terminated_length": 727.75, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.5639668857216011, - "grad_norm": 2.7055652141571045, - "kl": 4.1328125, - "learning_rate": 5.282960511118882e-07, - "loss": 0.2534, - "num_tokens": 920136122.0, - "reward": 1.81787109375, - "reward_std": 0.5305625200271606, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.1960885375738144, + "grad_norm": 1.8302973508834839, + "kl": 3.328125, + "learning_rate": 5.285556710332681e-07, + "loss": 0.2013, + "num_tokens": 964360376.0, + "reward": 1.0205078125, + "reward_std": 0.29737502336502075, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.19744645059108734, "step": 1652 }, { @@ -47923,27 +47923,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1969.0, - "completions/mean_length": 832.1015625, - "completions/mean_terminated_length": 812.8016357421875, - "completions/min_length": 250.0, - "completions/min_terminated_length": 250.0, + "completions/max_terminated_length": 1890.0, + "completions/mean_length": 714.77734375, + "completions/mean_terminated_length": 704.279541015625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, "epoch": 0.5643082700349918, - "grad_norm": 3.7091875076293945, - "kl": 4.015625, - "learning_rate": 5.277605825976195e-07, - "loss": 0.2954, - "num_tokens": 920638990.0, - "reward": 1.85009765625, - "reward_std": 0.4466114640235901, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.93994140625, - "rewards/tag_count_reward/std": 0.1667913943529129, + "grad_norm": 6.870009422302246, + "kl": 3.5859375, + "learning_rate": 5.280199843525429e-07, + "loss": 0.2239, + "num_tokens": 964803174.0, + "reward": 0.98974609375, + "reward_std": 0.22651655972003937, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.18584084510803223, "step": 1653 }, { @@ -47952,27 +47952,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1983.0, - "completions/mean_length": 823.515625, - "completions/mean_terminated_length": 776.3245239257812, - "completions/min_length": 153.0, - "completions/min_terminated_length": 153.0, + "completions/max_terminated_length": 1916.0, + "completions/mean_length": 674.29296875, + "completions/mean_terminated_length": 671.6046752929688, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, "epoch": 0.5646496543483827, - "grad_norm": 2.001434803009033, - "kl": 6.2421875, - "learning_rate": 5.272251456481363e-07, - "loss": 0.4146, - "num_tokens": 921139206.0, - "reward": 1.78466796875, - "reward_std": 0.5764614343643188, - "rewards/accuracy_reward/mean": 0.05443548411130905, - "rewards/accuracy_reward/std": 0.2271040678024292, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.19614213705062866, + "grad_norm": 4.450105667114258, + "kl": 3.328125, + "learning_rate": 5.27484328892104e-07, + "loss": 0.1532, + "num_tokens": 965226988.0, + "reward": 0.99853515625, + "reward_std": 0.2641572058200836, + "rewards/accuracy_reward/mean": 0.06451612710952759, + "rewards/accuracy_reward/std": 0.2459181249141693, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.92236328125, + "rewards/tag_count_reward/std": 0.18950672447681427, "step": 1654 }, { @@ -47981,27 +47981,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 879.6640625, - "completions/mean_terminated_length": 837.0931396484375, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 745.22265625, + "completions/mean_terminated_length": 737.5442504882812, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.5649910386617735, - "grad_norm": 2.615372896194458, - "kl": 6.3359375, - "learning_rate": 5.266897410233934e-07, - "loss": 0.3738, - "num_tokens": 921662618.0, - "reward": 1.79833984375, - "reward_std": 0.5633862018585205, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.18665657937526703, + "grad_norm": 1.9903414249420166, + "kl": 2.892578125, + "learning_rate": 5.26948705412793e-07, + "loss": 0.1669, + "num_tokens": 965681566.0, + "reward": 1.001953125, + "reward_std": 0.2866523265838623, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.19297493994235992, "step": 1655 }, { @@ -48010,27 +48010,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1918.0, - "completions/mean_length": 843.025390625, - "completions/mean_terminated_length": 788.9244384765625, - "completions/min_length": 172.0, - "completions/min_terminated_length": 172.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 701.244140625, + "completions/mean_terminated_length": 695.9627685546875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, "epoch": 0.5653324229751643, - "grad_norm": 2.918985605239868, - "kl": 7.78125, - "learning_rate": 5.261543694832994e-07, - "loss": 0.4458, - "num_tokens": 922169671.0, - "reward": 1.7412109375, - "reward_std": 0.6137492656707764, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.796875, - "rewards/format_reward/std": 0.4027182459831238, - "rewards/tag_count_reward/mean": 0.9013671875, - "rewards/tag_count_reward/std": 0.21453560888767242, + "grad_norm": 2.3789944648742676, + "kl": 2.9140625, + "learning_rate": 5.264131146754067e-07, + "loss": 0.1592, + "num_tokens": 966116027.0, + "reward": 1.02001953125, + "reward_std": 0.23278766870498657, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.1780041754245758, "step": 1656 }, { @@ -48039,27 +48039,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1980.0, - "completions/mean_length": 812.248046875, - "completions/mean_terminated_length": 764.6226806640625, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 703.81640625, + "completions/mean_terminated_length": 698.545166015625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, "epoch": 0.5656738072885551, - "grad_norm": 3.5504391193389893, - "kl": 7.1171875, - "learning_rate": 5.256190317877164e-07, - "loss": 0.4138, - "num_tokens": 922658262.0, - "reward": 1.822265625, - "reward_std": 0.5267001390457153, - "rewards/accuracy_reward/mean": 0.0947580635547638, - "rewards/accuracy_reward/std": 0.29317617416381836, - "rewards/format_reward/mean": 0.8125, - "rewards/format_reward/std": 0.39069411158561707, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.19499453902244568, + "grad_norm": 2.7921977043151855, + "kl": 3.26953125, + "learning_rate": 5.258775574406948e-07, + "loss": 0.2053, + "num_tokens": 966549101.0, + "reward": 1.0712890625, + "reward_std": 0.2655341923236847, + "rewards/accuracy_reward/mean": 0.13104838132858276, + "rewards/accuracy_reward/std": 0.3377939760684967, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.9326171875, + "rewards/tag_count_reward/std": 0.17653599381446838, "step": 1657 }, { @@ -48068,27 +48068,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 820.455078125, - "completions/mean_terminated_length": 767.9531860351562, - "completions/min_length": 98.0, - "completions/min_terminated_length": 98.0, + "completions/max_terminated_length": 1824.0, + "completions/mean_length": 690.478515625, + "completions/mean_terminated_length": 685.1549682617188, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.5660151916019459, - "grad_norm": 3.745274066925049, - "kl": 8.2265625, - "learning_rate": 5.250837286964585e-07, - "loss": 0.4714, - "num_tokens": 923160175.0, - "reward": 1.791015625, - "reward_std": 0.6207277178764343, - "rewards/accuracy_reward/mean": 0.0786290317773819, - "rewards/accuracy_reward/std": 0.26943066716194153, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.90625, - "rewards/tag_count_reward/std": 0.21215508878231049, + "grad_norm": 1.870984435081482, + "kl": 3.5234375, + "learning_rate": 5.253420344693598e-07, + "loss": 0.2072, + "num_tokens": 966984466.0, + "reward": 1.00390625, + "reward_std": 0.2924517095088959, + "rewards/accuracy_reward/mean": 0.08467742055654526, + "rewards/accuracy_reward/std": 0.278682142496109, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.20388682186603546, "step": 1658 }, { @@ -48097,27 +48097,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 897.849609375, - "completions/mean_terminated_length": 848.6578979492188, - "completions/min_length": 19.0, - "completions/min_terminated_length": 19.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 731.7578125, + "completions/mean_terminated_length": 724.0, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, "epoch": 0.5663565759153367, - "grad_norm": 5.2440876960754395, - "kl": 9.34375, - "learning_rate": 5.245484609692906e-07, - "loss": 0.5069, - "num_tokens": 923700306.0, - "reward": 1.693359375, - "reward_std": 0.5803141593933105, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.767578125, - "rewards/format_reward/std": 0.42278963327407837, - "rewards/tag_count_reward/mean": 0.88671875, - "rewards/tag_count_reward/std": 0.22580444812774658, + "grad_norm": 2.058933973312378, + "kl": 3.1875, + "learning_rate": 5.248065465220552e-07, + "loss": 0.17, + "num_tokens": 967439558.0, + "reward": 0.9775390625, + "reward_std": 0.25868144631385803, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.9169921875, + "rewards/tag_count_reward/std": 0.19395042955875397, "step": 1659 }, { @@ -48126,27 +48126,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.064453125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 882.724609375, - "completions/mean_terminated_length": 802.4447021484375, - "completions/min_length": 49.0, - "completions/min_terminated_length": 49.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 738.7265625, + "completions/mean_terminated_length": 733.5922241210938, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.5666979602287275, - "grad_norm": 2.603729009628296, - "kl": 8.8125, - "learning_rate": 5.240132293659268e-07, - "loss": 0.5226, - "num_tokens": 924225749.0, - "reward": 1.69091796875, - "reward_std": 0.5992175936698914, - "rewards/accuracy_reward/mean": 0.01953125, - "rewards/accuracy_reward/std": 0.1385180652141571, - "rewards/format_reward/mean": 0.779296875, - "rewards/format_reward/std": 0.4151262938976288, - "rewards/tag_count_reward/mean": 0.89208984375, - "rewards/tag_count_reward/std": 0.22491775453090668, + "grad_norm": 2.0712203979492188, + "kl": 2.94921875, + "learning_rate": 5.242710943593852e-07, + "loss": 0.1711, + "num_tokens": 967891274.0, + "reward": 0.9794921875, + "reward_std": 0.17697089910507202, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.17971175909042358, "step": 1660 }, { @@ -48155,27 +48155,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 824.6171875, - "completions/mean_terminated_length": 780.04052734375, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1823.0, + "completions/max_terminated_length": 1823.0, + "completions/mean_length": 671.28515625, + "completions/mean_terminated_length": 671.28515625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, "epoch": 0.5670393445421182, - "grad_norm": 1.838841199874878, - "kl": 7.6953125, - "learning_rate": 5.234780346460314e-07, - "loss": 0.4604, - "num_tokens": 924729873.0, - "reward": 1.81298828125, - "reward_std": 0.6124602556228638, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.80078125, - "rewards/format_reward/std": 0.39980348944664, - "rewards/tag_count_reward/mean": 0.90869140625, - "rewards/tag_count_reward/std": 0.20232902467250824, + "grad_norm": 3.5400679111480713, + "kl": 2.34375, + "learning_rate": 5.237356787419028e-07, + "loss": 0.1438, + "num_tokens": 968316892.0, + "reward": 1.0859375, + "reward_std": 0.2900366187095642, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.18752038478851318, "step": 1661 }, { @@ -48184,27 +48184,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 824.201171875, - "completions/mean_terminated_length": 784.7237548828125, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_terminated_length": 1854.0, + "completions/mean_length": 698.6875, + "completions/mean_terminated_length": 688.06298828125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, "epoch": 0.567380728855509, - "grad_norm": 0.8633550405502319, - "kl": 5.0078125, - "learning_rate": 5.229428775692146e-07, - "loss": 0.2933, - "num_tokens": 925226664.0, - "reward": 1.8515625, - "reward_std": 0.5280296802520752, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.18213331699371338, + "grad_norm": 2.1293344497680664, + "kl": 2.890625, + "learning_rate": 5.232003004301095e-07, + "loss": 0.16, + "num_tokens": 968749420.0, + "reward": 1.02392578125, + "reward_std": 0.2279701828956604, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.1780739426612854, "step": 1662 }, { @@ -48213,27 +48213,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1951.0, - "completions/mean_length": 889.419921875, - "completions/mean_terminated_length": 842.3231201171875, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1738.0, + "completions/max_terminated_length": 1738.0, + "completions/mean_length": 737.484375, + "completions/mean_terminated_length": 737.484375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, "epoch": 0.5677221131688999, - "grad_norm": 4.076156139373779, - "kl": 4.75390625, - "learning_rate": 5.224077588950342e-07, - "loss": 0.3376, - "num_tokens": 925763871.0, - "reward": 1.857421875, - "reward_std": 0.5448045134544373, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.18686699867248535, + "grad_norm": 4.0087785720825195, + "kl": 3.02734375, + "learning_rate": 5.226649601844531e-07, + "loss": 0.1405, + "num_tokens": 969208836.0, + "reward": 1.06787109375, + "reward_std": 0.2967276871204376, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.17642973363399506, "step": 1663 }, { @@ -48242,27 +48242,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1926.0, - "completions/mean_length": 776.384765625, - "completions/mean_terminated_length": 743.2565307617188, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 684.5, + "completions/mean_terminated_length": 668.33203125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.5680634974822907, - "grad_norm": 1.9562805891036987, - "kl": 4.57421875, - "learning_rate": 5.218726793829936e-07, - "loss": 0.2657, - "num_tokens": 926248196.0, - "reward": 1.9189453125, - "reward_std": 0.5813266634941101, - "rewards/accuracy_reward/mean": 0.150390625, - "rewards/accuracy_reward/std": 0.35780346393585205, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.19661229848861694, + "grad_norm": 3.6469993591308594, + "kl": 3.40234375, + "learning_rate": 5.221296587653282e-07, + "loss": 0.2495, + "num_tokens": 969646116.0, + "reward": 1.091796875, + "reward_std": 0.3583020567893982, + "rewards/accuracy_reward/mean": 0.17578125, + "rewards/accuracy_reward/std": 0.3810062110424042, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.1964396834373474, "step": 1664 }, { @@ -48271,27 +48271,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1957.0, - "completions/mean_length": 872.9765625, - "completions/mean_terminated_length": 825.2113647460938, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/max_terminated_length": 1534.0, + "completions/mean_length": 683.466796875, + "completions/mean_terminated_length": 680.7964477539062, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, "epoch": 0.5684048817956815, - "grad_norm": 1.9900144338607788, - "kl": 5.4140625, - "learning_rate": 5.213376397925399e-07, - "loss": 0.3612, - "num_tokens": 926761240.0, - "reward": 1.8974609375, - "reward_std": 0.48910754919052124, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.1807086020708084, + "grad_norm": 2.857988119125366, + "kl": 2.748046875, + "learning_rate": 5.215943969330735e-07, + "loss": 0.1201, + "num_tokens": 970062131.0, + "reward": 1.04345703125, + "reward_std": 0.28884726762771606, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.92822265625, + "rewards/tag_count_reward/std": 0.1779128909111023, "step": 1665 }, { @@ -48300,27 +48300,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 897.74609375, - "completions/mean_terminated_length": 823.61328125, - "completions/min_length": 68.0, - "completions/min_terminated_length": 68.0, + "completions/max_terminated_length": 1853.0, + "completions/mean_length": 736.939453125, + "completions/mean_terminated_length": 729.2122192382812, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.5687462661090723, - "grad_norm": 2.6536097526550293, - "kl": 6.1171875, - "learning_rate": 5.208026408830641e-07, - "loss": 0.4124, - "num_tokens": 927295526.0, - "reward": 1.77392578125, - "reward_std": 0.5496322512626648, - "rewards/accuracy_reward/mean": 0.03427419438958168, - "rewards/accuracy_reward/std": 0.18211627006530762, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.21884989738464355, + "grad_norm": 4.449905872344971, + "kl": 3.67578125, + "learning_rate": 5.210591754479718e-07, + "loss": 0.1914, + "num_tokens": 970514084.0, + "reward": 0.9990234375, + "reward_std": 0.2619768977165222, + "rewards/accuracy_reward/mean": 0.060483869165182114, + "rewards/accuracy_reward/std": 0.2386218160390854, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.19321990013122559, "step": 1666 }, { @@ -48329,27 +48329,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 847.388671875, - "completions/mean_terminated_length": 796.0387573242188, - "completions/min_length": 185.0, - "completions/min_terminated_length": 185.0, + "completions/max_terminated_length": 1919.0, + "completions/mean_length": 722.7109375, + "completions/mean_terminated_length": 701.6746215820312, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.5690876504224631, - "grad_norm": 2.7732086181640625, - "kl": 6.015625, - "learning_rate": 5.202676834138993e-07, - "loss": 0.4163, - "num_tokens": 927803773.0, - "reward": 1.88623046875, - "reward_std": 0.5830451846122742, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.1927117109298706, + "grad_norm": 3.718538761138916, + "kl": 3.23828125, + "learning_rate": 5.205239950702488e-07, + "loss": 0.2077, + "num_tokens": 970958496.0, + "reward": 1.05712890625, + "reward_std": 0.2981846034526825, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.1766246110200882, "step": 1667 }, { @@ -48358,27 +48358,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 836.603515625, - "completions/mean_terminated_length": 810.0059814453125, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1924.0, + "completions/max_terminated_length": 1924.0, + "completions/mean_length": 704.0703125, + "completions/mean_terminated_length": 704.0703125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, "epoch": 0.5694290347358539, - "grad_norm": 1.5123244524002075, - "kl": 5.6328125, - "learning_rate": 5.1973276814432e-07, - "loss": 0.3307, - "num_tokens": 928318402.0, - "reward": 1.814453125, - "reward_std": 0.5502277612686157, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.19968174397945404, + "grad_norm": 4.298553466796875, + "kl": 2.466796875, + "learning_rate": 5.199888565600715e-07, + "loss": 0.1147, + "num_tokens": 971405268.0, + "reward": 1.03173828125, + "reward_std": 0.27260592579841614, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, + "rewards/tag_count_reward/mean": 0.94189453125, + "rewards/tag_count_reward/std": 0.16821186244487762, "step": 1668 }, { @@ -48387,27 +48387,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 823.88671875, - "completions/mean_terminated_length": 774.1259765625, - "completions/min_length": 196.0, - "completions/min_terminated_length": 196.0, + "completions/max_terminated_length": 1735.0, + "completions/mean_length": 678.70703125, + "completions/mean_terminated_length": 676.0274047851562, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, "epoch": 0.5697704190492446, - "grad_norm": 0.9966332912445068, - "kl": 6.6015625, - "learning_rate": 5.191978958335402e-07, - "loss": 0.3997, - "num_tokens": 928812200.0, - "reward": 1.82861328125, - "reward_std": 0.5804578065872192, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.2056826800107956, + "grad_norm": 3.9969301223754883, + "kl": 3.41796875, + "learning_rate": 5.194537606775473e-07, + "loss": 0.2078, + "num_tokens": 971824734.0, + "reward": 1.08984375, + "reward_std": 0.30127280950546265, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.16547498106956482, "step": 1669 }, { @@ -48416,27 +48416,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 951.396484375, - "completions/mean_terminated_length": 880.721435546875, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_terminated_length": 1901.0, + "completions/mean_length": 776.201171875, + "completions/mean_terminated_length": 758.5723266601562, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.5701118033626354, - "grad_norm": 2.6709368228912354, - "kl": 8.8671875, - "learning_rate": 5.186630672407133e-07, - "loss": 0.5241, - "num_tokens": 929378563.0, - "reward": 1.7861328125, - "reward_std": 0.6257163286209106, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.8046875, - "rewards/format_reward/std": 0.3968288004398346, - "rewards/tag_count_reward/mean": 0.9033203125, - "rewards/tag_count_reward/std": 0.21936286985874176, + "grad_norm": 4.4810967445373535, + "kl": 3.6328125, + "learning_rate": 5.189187081827237e-07, + "loss": 0.2389, + "num_tokens": 972301397.0, + "reward": 1.04150390625, + "reward_std": 0.2942211329936981, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.19531220197677612, "step": 1670 }, { @@ -48445,27 +48445,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 848.033203125, - "completions/mean_terminated_length": 775.9855346679688, - "completions/min_length": 12.0, - "completions/min_terminated_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1761.0, + "completions/max_terminated_length": 1761.0, + "completions/mean_length": 683.1484375, + "completions/mean_terminated_length": 683.1484375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.5704531876760263, - "grad_norm": 3.107811689376831, - "kl": 10.359375, - "learning_rate": 5.181282831249311e-07, - "loss": 0.6312, - "num_tokens": 929890372.0, - "reward": 1.78662109375, - "reward_std": 0.6742834448814392, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.89013671875, - "rewards/tag_count_reward/std": 0.2382572442293167, + "grad_norm": 2.4237821102142334, + "kl": 2.83984375, + "learning_rate": 5.183836998355857e-07, + "loss": 0.1296, + "num_tokens": 972728785.0, + "reward": 1.04736328125, + "reward_std": 0.2979637384414673, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.17106766998767853, "step": 1671 }, { @@ -48474,27 +48474,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 836.806640625, - "completions/mean_terminated_length": 795.2101440429688, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/max_terminated_length": 1897.0, + "completions/mean_length": 723.416015625, + "completions/mean_terminated_length": 691.6260375976562, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, "epoch": 0.5707945719894171, - "grad_norm": 2.627119302749634, - "kl": 7.640625, - "learning_rate": 5.175935442452213e-07, - "loss": 0.462, - "num_tokens": 930398993.0, - "reward": 1.77978515625, - "reward_std": 0.5478862524032593, - "rewards/accuracy_reward/mean": 0.038306452333927155, - "rewards/accuracy_reward/std": 0.19212883710861206, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.19551268219947815, + "grad_norm": 2.645691394805908, + "kl": 3.9296875, + "learning_rate": 5.178487363960563e-07, + "loss": 0.252, + "num_tokens": 973179350.0, + "reward": 0.9921875, + "reward_std": 0.26792672276496887, + "rewards/accuracy_reward/mean": 0.06854838877916336, + "rewards/accuracy_reward/std": 0.25293973088264465, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.19604040682315826, "step": 1672 }, { @@ -48503,27 +48503,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1884.0, - "completions/mean_length": 862.94140625, - "completions/mean_terminated_length": 819.7611694335938, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1806.0, + "completions/max_terminated_length": 1806.0, + "completions/mean_length": 712.76953125, + "completions/mean_terminated_length": 712.76953125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, "epoch": 0.5711359563028079, - "grad_norm": 2.356896162033081, - "kl": 6.875, - "learning_rate": 5.170588513605485e-07, - "loss": 0.3958, - "num_tokens": 930923731.0, - "reward": 1.8046875, - "reward_std": 0.5468517541885376, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.91015625, - "rewards/tag_count_reward/std": 0.21555092930793762, + "grad_norm": 2.53078556060791, + "kl": 2.48046875, + "learning_rate": 5.173138186239943e-07, + "loss": 0.138, + "num_tokens": 973627200.0, + "reward": 1.0751953125, + "reward_std": 0.2726728916168213, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.9423828125, + "rewards/tag_count_reward/std": 0.1628410518169403, "step": 1673 }, { @@ -48532,27 +48532,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1956.0, - "completions/mean_length": 857.18359375, - "completions/mean_terminated_length": 793.4773559570312, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 1805.0, + "completions/mean_length": 701.83203125, + "completions/mean_terminated_length": 696.552978515625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.5714773406161987, - "grad_norm": 1.1741122007369995, - "kl": 7.671875, - "learning_rate": 5.165242052298112e-07, - "loss": 0.4698, - "num_tokens": 931438017.0, - "reward": 1.7880859375, - "reward_std": 0.6110674142837524, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.9072265625, - "rewards/tag_count_reward/std": 0.2143038660287857, + "grad_norm": 2.060983657836914, + "kl": 3.14453125, + "learning_rate": 5.167789472791942e-07, + "loss": 0.1736, + "num_tokens": 974061946.0, + "reward": 1.005859375, + "reward_std": 0.26487523317337036, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.18718376755714417, "step": 1674 }, { @@ -48561,27 +48561,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 843.484375, - "completions/mean_terminated_length": 786.8302612304688, - "completions/min_length": 8.0, - "completions/min_terminated_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2006.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 709.5703125, + "completions/mean_terminated_length": 709.5703125, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, "epoch": 0.5718187249295895, - "grad_norm": 0.7875505685806274, - "kl": 6.90625, - "learning_rate": 5.159896066118417e-07, - "loss": 0.4293, - "num_tokens": 931949369.0, - "reward": 1.83056640625, - "reward_std": 0.5713907480239868, - "rewards/accuracy_reward/mean": 0.07083333283662796, - "rewards/accuracy_reward/std": 0.2568138837814331, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.20863474905490875, + "grad_norm": 4.357063293457031, + "kl": 2.55859375, + "learning_rate": 5.162441231213834e-07, + "loss": 0.1463, + "num_tokens": 974504734.0, + "reward": 1.04443359375, + "reward_std": 0.24250677227973938, + "rewards/accuracy_reward/mean": 0.1041666641831398, + "rewards/accuracy_reward/std": 0.3057953417301178, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.94287109375, + "rewards/tag_count_reward/std": 0.1663554310798645, "step": 1675 }, { @@ -48590,27 +48590,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 864.046875, - "completions/mean_terminated_length": 785.11669921875, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 1907.0, + "completions/mean_length": 693.94921875, + "completions/mean_terminated_length": 685.9685668945312, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, "epoch": 0.5721601092429803, - "grad_norm": 2.757953405380249, - "kl": 7.40625, - "learning_rate": 5.15455056265405e-07, - "loss": 0.5266, - "num_tokens": 932473153.0, - "reward": 1.81494140625, - "reward_std": 0.5837230682373047, - "rewards/accuracy_reward/mean": 0.06451612710952759, - "rewards/accuracy_reward/std": 0.2459181249141693, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.2078409045934677, + "grad_norm": 2.776296615600586, + "kl": 3.671875, + "learning_rate": 5.157093469102236e-07, + "loss": 0.223, + "num_tokens": 974941428.0, + "reward": 0.9970703125, + "reward_std": 0.2664055824279785, + "rewards/accuracy_reward/mean": 0.060483869165182114, + "rewards/accuracy_reward/std": 0.2386218160390854, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.18964596092700958, "step": 1676 }, { @@ -48619,27 +48619,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 829.92578125, - "completions/mean_terminated_length": 795.6826782226562, - "completions/min_length": 185.0, - "completions/min_terminated_length": 185.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2023.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 677.474609375, + "completions/mean_terminated_length": 677.474609375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.572501493556371, - "grad_norm": 2.9485995769500732, - "kl": 4.1484375, - "learning_rate": 5.149205549491975e-07, - "loss": 0.2912, - "num_tokens": 932972811.0, - "reward": 1.93408203125, - "reward_std": 0.4794820547103882, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.89453125, - "rewards/format_reward/std": 0.3074568510055542, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.17322689294815063, + "grad_norm": 2.1158411502838135, + "kl": 2.7109375, + "learning_rate": 5.151746194053077e-07, + "loss": 0.1791, + "num_tokens": 975363031.0, + "reward": 1.09912109375, + "reward_std": 0.29495304822921753, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.95068359375, + "rewards/tag_count_reward/std": 0.1568230390548706, "step": 1677 }, { @@ -48648,27 +48648,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 866.876953125, - "completions/mean_terminated_length": 806.244384765625, - "completions/min_length": 197.0, - "completions/min_terminated_length": 197.0, + "completions/max_terminated_length": 1929.0, + "completions/mean_length": 702.82421875, + "completions/mean_terminated_length": 697.549072265625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.5728428778697618, - "grad_norm": 2.8530094623565674, - "kl": 6.765625, - "learning_rate": 5.143861034218462e-07, - "loss": 0.4756, - "num_tokens": 933490908.0, - "reward": 1.81005859375, - "reward_std": 0.5465816259384155, - "rewards/accuracy_reward/mean": 0.05645161122083664, - "rewards/accuracy_reward/std": 0.23102475702762604, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.20999513566493988, + "grad_norm": 1.5915335416793823, + "kl": 3.359375, + "learning_rate": 5.146399413661595e-07, + "loss": 0.1959, + "num_tokens": 975797133.0, + "reward": 1.0087890625, + "reward_std": 0.26261550188064575, + "rewards/accuracy_reward/mean": 0.07661290466785431, + "rewards/accuracy_reward/std": 0.2662447690963745, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.18546834588050842, "step": 1678 }, { @@ -48677,27 +48677,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 844.345703125, - "completions/mean_terminated_length": 792.8656005859375, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 1789.0, + "completions/mean_length": 712.98828125, + "completions/mean_terminated_length": 707.7529907226562, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, "epoch": 0.5731842621831527, - "grad_norm": 2.1478755474090576, - "kl": 6.59375, - "learning_rate": 5.138517024419071e-07, - "loss": 0.4221, - "num_tokens": 934008301.0, - "reward": 1.80029296875, - "reward_std": 0.5182666778564453, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.1960885375738144, + "grad_norm": 3.040320873260498, + "kl": 2.703125, + "learning_rate": 5.141053135522324e-07, + "loss": 0.1546, + "num_tokens": 976247271.0, + "reward": 1.01904296875, + "reward_std": 0.23498836159706116, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.169349804520607, "step": 1679 }, { @@ -48706,27 +48706,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 815.40234375, - "completions/mean_terminated_length": 775.64111328125, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/max_terminated_length": 1756.0, + "completions/mean_length": 729.466796875, + "completions/mean_terminated_length": 724.296142578125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, "epoch": 0.5735256464965435, - "grad_norm": 2.8920838832855225, - "kl": 5.71875, - "learning_rate": 5.13317352767865e-07, - "loss": 0.4276, - "num_tokens": 934500219.0, - "reward": 1.87109375, - "reward_std": 0.49277210235595703, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.17274148762226105, + "grad_norm": 5.291203498840332, + "kl": 2.734375, + "learning_rate": 5.13570736722909e-07, + "loss": 0.1791, + "num_tokens": 976695190.0, + "reward": 1.02099609375, + "reward_std": 0.25781625509262085, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.94482421875, + "rewards/tag_count_reward/std": 0.15952341258525848, "step": 1680 }, { @@ -48735,27 +48735,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 855.181640625, - "completions/mean_terminated_length": 780.9398803710938, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 724.75, + "completions/mean_terminated_length": 716.950927734375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, "epoch": 0.5738670308099343, - "grad_norm": 1.3681129217147827, - "kl": 7.6953125, - "learning_rate": 5.127830551581311e-07, - "loss": 0.471, - "num_tokens": 935022392.0, - "reward": 1.79150390625, - "reward_std": 0.5700950622558594, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.90478515625, - "rewards/tag_count_reward/std": 0.22194115817546844, + "grad_norm": 2.1247153282165527, + "kl": 2.56640625, + "learning_rate": 5.130362116374989e-07, + "loss": 0.1758, + "num_tokens": 977150582.0, + "reward": 1.0576171875, + "reward_std": 0.27043211460113525, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.9404296875, + "rewards/tag_count_reward/std": 0.17023125290870667, "step": 1681 }, { @@ -48764,27 +48764,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1932.0, - "completions/mean_length": 890.224609375, - "completions/mean_terminated_length": 820.7101440429688, - "completions/min_length": 60.0, - "completions/min_terminated_length": 60.0, + "completions/max_terminated_length": 1902.0, + "completions/mean_length": 770.732421875, + "completions/mean_terminated_length": 763.204345703125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, "epoch": 0.5742084151233251, - "grad_norm": 4.133235454559326, - "kl": 9.375, - "learning_rate": 5.122488103710435e-07, - "loss": 0.5516, - "num_tokens": 935552955.0, - "reward": 1.77099609375, - "reward_std": 0.5811473727226257, - "rewards/accuracy_reward/mean": 0.060483869165182114, - "rewards/accuracy_reward/std": 0.2386218160390854, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.90185546875, - "rewards/tag_count_reward/std": 0.22065876424312592, + "grad_norm": 3.666898250579834, + "kl": 3.15234375, + "learning_rate": 5.125017390552383e-07, + "loss": 0.1993, + "num_tokens": 977619965.0, + "reward": 1.0322265625, + "reward_std": 0.2963072955608368, + "rewards/accuracy_reward/mean": 0.08669354766607285, + "rewards/accuracy_reward/std": 0.281669557094574, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.17588526010513306, "step": 1682 }, { @@ -48793,27 +48793,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 836.12109375, - "completions/mean_terminated_length": 773.90966796875, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, + "completions/max_terminated_length": 1745.0, + "completions/mean_length": 715.56640625, + "completions/mean_terminated_length": 707.7131958007812, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, "epoch": 0.5745497994367159, - "grad_norm": 1.4778292179107666, - "kl": 6.5078125, - "learning_rate": 5.117146191648647e-07, - "loss": 0.4152, - "num_tokens": 936056713.0, - "reward": 1.798828125, - "reward_std": 0.493483304977417, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.19736173748970032, + "grad_norm": 5.406630039215088, + "kl": 3.10546875, + "learning_rate": 5.11967319735289e-07, + "loss": 0.2202, + "num_tokens": 978061999.0, + "reward": 0.9951171875, + "reward_std": 0.23086810111999512, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.16953378915786743, "step": 1683 }, { @@ -48822,27 +48822,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 853.40234375, - "completions/mean_terminated_length": 799.767333984375, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/max_terminated_length": 1909.0, + "completions/mean_length": 742.46875, + "completions/mean_terminated_length": 734.7741088867188, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.5748911837501067, - "grad_norm": 3.5044214725494385, - "kl": 8.3828125, - "learning_rate": 5.111804822977814e-07, - "loss": 0.5136, - "num_tokens": 936565591.0, - "reward": 1.8291015625, - "reward_std": 0.5825339555740356, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, - "rewards/tag_count_reward/mean": 0.9091796875, - "rewards/tag_count_reward/std": 0.21342815458774567, + "grad_norm": 3.937920331954956, + "kl": 3.56640625, + "learning_rate": 5.114329544367374e-07, + "loss": 0.2379, + "num_tokens": 978514079.0, + "reward": 1.076171875, + "reward_std": 0.28781288862228394, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.18652920424938202, "step": 1684 }, { @@ -48851,27 +48851,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 775.412109375, - "completions/mean_terminated_length": 739.6365356445312, - "completions/min_length": 176.0, - "completions/min_terminated_length": 176.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 651.0546875, + "completions/mean_terminated_length": 645.5765380859375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.5752325680634974, - "grad_norm": 1.9510211944580078, - "kl": 5.9609375, - "learning_rate": 5.106464005279034e-07, - "loss": 0.3894, - "num_tokens": 937036314.0, - "reward": 1.90869140625, - "reward_std": 0.5322166681289673, - "rewards/accuracy_reward/mean": 0.119140625, - "rewards/accuracy_reward/std": 0.32427072525024414, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.18794220685958862, + "grad_norm": 1.7381266355514526, + "kl": 2.435546875, + "learning_rate": 5.108986439185923e-07, + "loss": 0.168, + "num_tokens": 978921131.0, + "reward": 1.1142578125, + "reward_std": 0.2523620128631592, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9521484375, + "rewards/tag_count_reward/std": 0.15213829278945923, "step": 1685 }, { @@ -48880,27 +48880,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 768.40234375, - "completions/mean_terminated_length": 705.4712524414062, - "completions/min_length": 102.0, - "completions/min_terminated_length": 102.0, + "completions/max_terminated_length": 1629.0, + "completions/mean_length": 661.560546875, + "completions/mean_terminated_length": 658.8473510742188, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, "epoch": 0.5755739523768882, - "grad_norm": 1.8461062908172607, - "kl": 7.65625, - "learning_rate": 5.101123746132622e-07, - "loss": 0.492, - "num_tokens": 937505576.0, - "reward": 1.84912109375, - "reward_std": 0.5417767763137817, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.20101657509803772, + "grad_norm": 6.371468544006348, + "kl": 3.48828125, + "learning_rate": 5.103643889397858e-07, + "loss": 0.1871, + "num_tokens": 979335690.0, + "reward": 1.0849609375, + "reward_std": 0.30169448256492615, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.17239542305469513, "step": 1686 }, { @@ -48909,27 +48909,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 860.7890625, - "completions/mean_terminated_length": 792.107421875, - "completions/min_length": 214.0, - "completions/min_terminated_length": 214.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 707.240234375, + "completions/mean_terminated_length": 704.616455078125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, "epoch": 0.5759153366902791, - "grad_norm": 1.1507009267807007, - "kl": 7.0703125, - "learning_rate": 5.095784053118094e-07, - "loss": 0.4438, - "num_tokens": 938025740.0, - "reward": 1.78466796875, - "reward_std": 0.5180612206459045, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.203689306974411, + "grad_norm": 3.417599678039551, + "kl": 4.484375, + "learning_rate": 5.098301902591703e-07, + "loss": 0.2944, + "num_tokens": 979777237.0, + "reward": 1.02734375, + "reward_std": 0.2443060427904129, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.939453125, + "rewards/tag_count_reward/std": 0.16771192848682404, "step": 1687 }, { @@ -48938,27 +48938,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 847.638671875, - "completions/mean_terminated_length": 798.8434448242188, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 1778.0, + "completions/mean_length": 707.6953125, + "completions/mean_terminated_length": 702.4392700195312, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, "epoch": 0.5762567210036699, - "grad_norm": 2.3126566410064697, - "kl": 6.3125, - "learning_rate": 5.090444933814171e-07, - "loss": 0.4363, - "num_tokens": 938552275.0, - "reward": 1.7841796875, - "reward_std": 0.5397467613220215, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.193694069981575, + "grad_norm": 2.682257652282715, + "kl": 3.2412109375, + "learning_rate": 5.092960486355183e-07, + "loss": 0.1709, + "num_tokens": 980232121.0, + "reward": 1.0068359375, + "reward_std": 0.21877656877040863, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.9443359375, + "rewards/tag_count_reward/std": 0.16201746463775635, "step": 1688 }, { @@ -48967,27 +48967,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 813.15234375, - "completions/mean_terminated_length": 768.1578979492188, - "completions/min_length": 182.0, - "completions/min_terminated_length": 182.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 715.8046875, + "completions/mean_terminated_length": 697.338623046875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, "epoch": 0.5765981053170607, - "grad_norm": 1.8903048038482666, - "kl": 5.06640625, - "learning_rate": 5.085106395798756e-07, - "loss": 0.3525, - "num_tokens": 939049937.0, - "reward": 1.93359375, - "reward_std": 0.4996389150619507, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.17060412466526031, + "grad_norm": 6.733412742614746, + "kl": 5.1328125, + "learning_rate": 5.087619648275217e-07, + "loss": 0.2863, + "num_tokens": 980679941.0, + "reward": 1.056640625, + "reward_std": 0.32496365904808044, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.1840227097272873, "step": 1689 }, { @@ -48996,27 +48996,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.064453125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1924.0, - "completions/mean_length": 900.833984375, - "completions/mean_terminated_length": 821.8016967773438, - "completions/min_length": 187.0, - "completions/min_terminated_length": 187.0, + "completions/max_terminated_length": 1896.0, + "completions/mean_length": 789.392578125, + "completions/mean_terminated_length": 784.4569091796875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, "epoch": 0.5769394896304515, - "grad_norm": 1.2121962308883667, - "kl": 7.21875, - "learning_rate": 5.079768446648926e-07, - "loss": 0.4765, - "num_tokens": 939583628.0, - "reward": 1.75390625, - "reward_std": 0.6164195537567139, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.7890625, - "rewards/format_reward/std": 0.4083731174468994, - "rewards/tag_count_reward/mean": 0.900390625, - "rewards/tag_count_reward/std": 0.21579019725322723, + "grad_norm": 2.947176218032837, + "kl": 3.984375, + "learning_rate": 5.082279395937903e-07, + "loss": 0.2323, + "num_tokens": 981156574.0, + "reward": 1.01318359375, + "reward_std": 0.27712419629096985, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.91748046875, + "rewards/tag_count_reward/std": 0.18807943165302277, "step": 1690 }, { @@ -49025,27 +49025,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1984.0, - "completions/mean_length": 801.580078125, - "completions/mean_terminated_length": 742.9550170898438, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 703.67578125, + "completions/mean_terminated_length": 695.7525024414062, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.5772808739438423, - "grad_norm": 1.0426567792892456, - "kl": 5.421875, - "learning_rate": 5.074431093940927e-07, - "loss": 0.3279, - "num_tokens": 940070069.0, - "reward": 1.841796875, - "reward_std": 0.5356305837631226, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.20497123897075653, + "grad_norm": 3.084765911102295, + "kl": 3.92578125, + "learning_rate": 5.076939736928497e-07, + "loss": 0.2381, + "num_tokens": 981592888.0, + "reward": 1.03759765625, + "reward_std": 0.2766039967536926, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.92041015625, + "rewards/tag_count_reward/std": 0.184763103723526, "step": 1691 }, { @@ -49054,27 +49054,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1959.0, - "completions/mean_length": 825.408203125, - "completions/mean_terminated_length": 773.1181640625, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/max_terminated_length": 1759.0, + "completions/mean_length": 694.015625, + "completions/mean_terminated_length": 688.7059326171875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, "epoch": 0.5776222582572331, - "grad_norm": 1.4213882684707642, - "kl": 5.96875, - "learning_rate": 5.069094345250152e-07, - "loss": 0.3999, - "num_tokens": 940563334.0, - "reward": 1.83447265625, - "reward_std": 0.5573749542236328, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.20686857402324677, + "grad_norm": 2.7203426361083984, + "kl": 3.5234375, + "learning_rate": 5.071600678831427e-07, + "loss": 0.2209, + "num_tokens": 982018880.0, + "reward": 1.037109375, + "reward_std": 0.2785555124282837, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.1847999542951584, "step": 1692 }, { @@ -49083,27 +49083,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1952.0, - "completions/mean_length": 771.3515625, - "completions/mean_terminated_length": 727.507080078125, - "completions/min_length": 114.0, - "completions/min_terminated_length": 114.0, + "completions/max_terminated_length": 1873.0, + "completions/mean_length": 663.845703125, + "completions/mean_terminated_length": 655.6876220703125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, "epoch": 0.5779636425706238, - "grad_norm": 2.150052547454834, - "kl": 5.109375, - "learning_rate": 5.063758208151139e-07, - "loss": 0.3768, - "num_tokens": 941031626.0, - "reward": 1.94140625, - "reward_std": 0.5016964673995972, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.18501698970794678, + "grad_norm": 2.53839373588562, + "kl": 2.47265625, + "learning_rate": 5.066262229230254e-07, + "loss": 0.1453, + "num_tokens": 982432129.0, + "reward": 1.0986328125, + "reward_std": 0.22677066922187805, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.9423828125, + "rewards/tag_count_reward/std": 0.16655419766902924, "step": 1693 }, { @@ -49112,27 +49112,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 819.5234375, - "completions/mean_terminated_length": 743.062255859375, - "completions/min_length": 117.0, - "completions/min_terminated_length": 117.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 717.2265625, + "completions/mean_terminated_length": 704.1026000976562, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.5783050268840146, - "grad_norm": 1.2886264324188232, - "kl": 5.40625, - "learning_rate": 5.058422690217559e-07, - "loss": 0.3455, - "num_tokens": 941536326.0, - "reward": 1.86865234375, - "reward_std": 0.49906250834465027, - "rewards/accuracy_reward/mean": 0.07459677755832672, - "rewards/accuracy_reward/std": 0.263004869222641, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.18445254862308502, + "grad_norm": 2.3313944339752197, + "kl": 4.12109375, + "learning_rate": 5.060924395707685e-07, + "loss": 0.2619, + "num_tokens": 982884453.0, + "reward": 1.02392578125, + "reward_std": 0.24411720037460327, + "rewards/accuracy_reward/mean": 0.08669354766607285, + "rewards/accuracy_reward/std": 0.281669557094574, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.17731572687625885, "step": 1694 }, { @@ -49141,27 +49141,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 911.7421875, - "completions/mean_terminated_length": 848.4866333007812, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 1831.0, + "completions/mean_length": 743.892578125, + "completions/mean_terminated_length": 741.3405151367188, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.5786464111974055, - "grad_norm": 1.5921692848205566, - "kl": 7.015625, - "learning_rate": 5.053087799022207e-07, - "loss": 0.4269, - "num_tokens": 942081234.0, - "reward": 1.74951171875, - "reward_std": 0.5302519202232361, - "rewards/accuracy_reward/mean": 0.02734375, - "rewards/accuracy_reward/std": 0.16324250400066376, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.90576171875, - "rewards/tag_count_reward/std": 0.21847409009933472, + "grad_norm": 2.4283440113067627, + "kl": 1.935546875, + "learning_rate": 5.055587185845545e-07, + "loss": 0.1197, + "num_tokens": 983343422.0, + "reward": 1.01318359375, + "reward_std": 0.1927071213722229, + "rewards/accuracy_reward/mean": 0.044921875, + "rewards/accuracy_reward/std": 0.20733514428138733, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.95849609375, + "rewards/tag_count_reward/std": 0.13494610786437988, "step": 1695 }, { @@ -49170,27 +49170,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1804.0, - "completions/mean_length": 732.23828125, - "completions/mean_terminated_length": 700.6600341796875, - "completions/min_length": 115.0, - "completions/min_terminated_length": 115.0, + "completions/max_terminated_length": 1856.0, + "completions/mean_length": 647.02734375, + "completions/mean_terminated_length": 641.5333862304688, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.5789877955107963, - "grad_norm": 3.2324378490448, - "kl": 5.046875, - "learning_rate": 5.047753542136981e-07, - "loss": 0.3475, - "num_tokens": 942526796.0, - "reward": 1.8984375, - "reward_std": 0.5194200873374939, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.16834884881973267, + "grad_norm": 6.847288608551025, + "kl": 2.84765625, + "learning_rate": 5.05025060722478e-07, + "loss": 0.2125, + "num_tokens": 983745356.0, + "reward": 1.03662109375, + "reward_std": 0.2541506886482239, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.1729232519865036, "step": 1696 }, { @@ -49199,27 +49199,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 866.65625, - "completions/mean_terminated_length": 813.6162719726562, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1871.0, + "completions/max_terminated_length": 1871.0, + "completions/mean_length": 723.41796875, + "completions/mean_terminated_length": 723.41796875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.5793291798241871, - "grad_norm": 2.79946231842041, - "kl": 5.9140625, - "learning_rate": 5.042419927132886e-07, - "loss": 0.3651, - "num_tokens": 943055228.0, - "reward": 1.83203125, - "reward_std": 0.5605237483978271, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.2096906155347824, + "grad_norm": 2.8784735202789307, + "kl": 1.892578125, + "learning_rate": 5.044914667425427e-07, + "loss": 0.1116, + "num_tokens": 984200450.0, + "reward": 1.0419921875, + "reward_std": 0.2304690033197403, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.1709705889225006, "step": 1697 }, { @@ -49228,27 +49228,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 821.318359375, - "completions/mean_terminated_length": 791.8780517578125, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 1797.0, + "completions/mean_length": 719.849609375, + "completions/mean_terminated_length": 712.0216064453125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.5796705641375779, - "grad_norm": 1.3076624870300293, - "kl": 6.078125, - "learning_rate": 5.037086961580012e-07, - "loss": 0.3656, - "num_tokens": 943555407.0, - "reward": 1.85546875, - "reward_std": 0.5533447861671448, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.18752038478851318, + "grad_norm": 2.2479898929595947, + "kl": 2.49609375, + "learning_rate": 5.039579374026633e-07, + "loss": 0.1552, + "num_tokens": 984648677.0, + "reward": 1.04150390625, + "reward_std": 0.2523299753665924, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.94384765625, + "rewards/tag_count_reward/std": 0.15763740241527557, "step": 1698 }, { @@ -49257,27 +49257,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 859.642578125, - "completions/mean_terminated_length": 793.4866333007812, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/max_terminated_length": 1852.0, + "completions/mean_length": 728.03125, + "completions/mean_terminated_length": 725.4481201171875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.5800119484509687, - "grad_norm": 2.272294521331787, - "kl": 6.4921875, - "learning_rate": 5.031754653047528e-07, - "loss": 0.375, - "num_tokens": 944063832.0, - "reward": 1.8642578125, - "reward_std": 0.5668913125991821, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.2129799723625183, + "grad_norm": 5.775050640106201, + "kl": 2.51953125, + "learning_rate": 5.034244734606612e-07, + "loss": 0.1779, + "num_tokens": 985089717.0, + "reward": 1.10693359375, + "reward_std": 0.29803240299224854, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.94287109375, + "rewards/tag_count_reward/std": 0.16338804364204407, "step": 1699 }, { @@ -49286,27 +49286,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1969.0, - "completions/mean_length": 864.37109375, - "completions/mean_terminated_length": 808.6993408203125, - "completions/min_length": 200.0, - "completions/min_terminated_length": 200.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 722.53515625, + "completions/mean_terminated_length": 714.7230224609375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, "epoch": 0.5803533327643595, - "grad_norm": 1.033617377281189, - "kl": 6.515625, - "learning_rate": 5.02642300910367e-07, - "loss": 0.3874, - "num_tokens": 944594998.0, - "reward": 1.81494140625, - "reward_std": 0.5386810898780823, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.1984763890504837, + "grad_norm": 3.4308431148529053, + "kl": 3.125, + "learning_rate": 5.028910756742655e-07, + "loss": 0.2067, + "num_tokens": 985548263.0, + "reward": 1.0634765625, + "reward_std": 0.27786415815353394, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.17448893189430237, "step": 1700 }, { @@ -49315,27 +49315,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 885.51171875, - "completions/mean_terminated_length": 848.0120849609375, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1972.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 724.4921875, + "completions/mean_terminated_length": 724.4921875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, "epoch": 0.5806947170777502, - "grad_norm": 1.0708470344543457, - "kl": 5.8671875, - "learning_rate": 5.021092037315733e-07, - "loss": 0.349, - "num_tokens": 945124076.0, - "reward": 1.80029296875, - "reward_std": 0.5660788416862488, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.19340462982654572, + "grad_norm": 3.092909574508667, + "kl": 2.15234375, + "learning_rate": 5.023577448011116e-07, + "loss": 0.0926, + "num_tokens": 985994899.0, + "reward": 1.01806640625, + "reward_std": 0.22545407712459564, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.94775390625, + "rewards/tag_count_reward/std": 0.15349750220775604, "step": 1701 }, { @@ -49344,27 +49344,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 881.14453125, - "completions/mean_terminated_length": 813.6404418945312, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 732.69921875, + "completions/mean_terminated_length": 730.125244140625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.581036101391141, - "grad_norm": 1.2517772912979126, - "kl": 7.6171875, - "learning_rate": 5.015761745250055e-07, - "loss": 0.4515, - "num_tokens": 945645606.0, - "reward": 1.78955078125, - "reward_std": 0.6008893847465515, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.20643390715122223, + "grad_norm": 2.8426170349121094, + "kl": 3.0703125, + "learning_rate": 5.018244815987395e-07, + "loss": 0.1599, + "num_tokens": 986440425.0, + "reward": 1.05224609375, + "reward_std": 0.27782613039016724, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.94287109375, + "rewards/tag_count_reward/std": 0.1588330715894699, "step": 1702 }, { @@ -49373,27 +49373,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1966.0, - "completions/mean_length": 856.904296875, - "completions/mean_terminated_length": 813.5040893554688, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 1726.0, + "completions/mean_length": 714.419921875, + "completions/mean_terminated_length": 709.1902465820312, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, "epoch": 0.5813774857045318, - "grad_norm": 2.9970943927764893, - "kl": 7.046875, - "learning_rate": 5.01043214047201e-07, - "loss": 0.392, - "num_tokens": 946160821.0, - "reward": 1.82958984375, - "reward_std": 0.5553572177886963, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.20408765971660614, + "grad_norm": 6.054725646972656, + "kl": 3.40234375, + "learning_rate": 5.012912868245927e-07, + "loss": 0.131, + "num_tokens": 986882688.0, + "reward": 1.07421875, + "reward_std": 0.3116151690483093, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.1774672269821167, "step": 1703 }, { @@ -49402,27 +49402,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 835.712890625, - "completions/mean_terminated_length": 796.6068115234375, - "completions/min_length": 168.0, - "completions/min_terminated_length": 168.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1681.0, + "completions/max_terminated_length": 1681.0, + "completions/mean_length": 712.48046875, + "completions/mean_terminated_length": 712.48046875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.5817188700179227, - "grad_norm": 2.7404091358184814, - "kl": 7.390625, - "learning_rate": 5.005103230546e-07, - "loss": 0.4008, - "num_tokens": 946661634.0, - "reward": 1.7939453125, - "reward_std": 0.5862789154052734, - "rewards/accuracy_reward/mean": 0.06854838877916336, - "rewards/accuracy_reward/std": 0.25293970108032227, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.9091796875, - "rewards/tag_count_reward/std": 0.21054330468177795, + "grad_norm": 2.780832290649414, + "kl": 2.255859375, + "learning_rate": 5.007581612360185e-07, + "loss": 0.1205, + "num_tokens": 987320406.0, + "reward": 1.0771484375, + "reward_std": 0.2293703705072403, + "rewards/accuracy_reward/mean": 0.11491935700178146, + "rewards/accuracy_reward/std": 0.3192465901374817, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.9541015625, + "rewards/tag_count_reward/std": 0.14280793070793152, "step": 1704 }, { @@ -49431,27 +49431,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 795.357421875, - "completions/mean_terminated_length": 757.55126953125, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1924.0, + "completions/max_terminated_length": 1924.0, + "completions/mean_length": 651.3203125, + "completions/mean_terminated_length": 651.3203125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, "epoch": 0.5820602543313135, - "grad_norm": 2.0220682621002197, - "kl": 6.984375, - "learning_rate": 4.999775023035438e-07, - "loss": 0.3819, - "num_tokens": 947144441.0, - "reward": 1.91552734375, - "reward_std": 0.6095578670501709, - "rewards/accuracy_reward/mean": 0.177734375, - "rewards/accuracy_reward/std": 0.3826628625392914, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.19944173097610474, + "grad_norm": 2.2586894035339355, + "kl": 2.5, + "learning_rate": 5.002251055902651e-07, + "loss": 0.1504, + "num_tokens": 987729466.0, + "reward": 1.20263671875, + "reward_std": 0.31580495834350586, + "rewards/accuracy_reward/mean": 0.244140625, + "rewards/accuracy_reward/std": 0.42999663949012756, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.95068359375, + "rewards/tag_count_reward/std": 0.15207155048847198, "step": 1705 }, { @@ -49460,27 +49460,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 829.986328125, - "completions/mean_terminated_length": 795.7449340820312, - "completions/min_length": 45.0, - "completions/min_terminated_length": 45.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1902.0, + "completions/max_terminated_length": 1902.0, + "completions/mean_length": 686.26953125, + "completions/mean_terminated_length": 686.26953125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, "epoch": 0.5824016386447043, - "grad_norm": 2.52016019821167, - "kl": 5.5703125, - "learning_rate": 4.994447525502735e-07, - "loss": 0.3674, - "num_tokens": 947649106.0, - "reward": 1.85400390625, - "reward_std": 0.5111713409423828, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, + "grad_norm": 3.3260016441345215, + "kl": 2.6328125, + "learning_rate": 4.996921206444818e-07, + "loss": 0.1547, + "num_tokens": 988160548.0, + "reward": 1.04541015625, + "reward_std": 0.2555238604545593, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.18540844321250916, + "rewards/tag_count_reward/std": 0.1793731451034546, "step": 1706 }, { @@ -49489,27 +49489,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 858.845703125, - "completions/mean_terminated_length": 805.455078125, - "completions/min_length": 70.0, - "completions/min_terminated_length": 70.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 686.24609375, + "completions/mean_terminated_length": 683.5812377929688, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, "epoch": 0.5827430229580951, - "grad_norm": 2.769322633743286, - "kl": 6.6796875, - "learning_rate": 4.989120745509305e-07, - "loss": 0.466, - "num_tokens": 948168035.0, - "reward": 1.78564453125, - "reward_std": 0.6224857568740845, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.89892578125, - "rewards/tag_count_reward/std": 0.22099627554416656, + "grad_norm": 5.524597644805908, + "kl": 3.505859375, + "learning_rate": 4.991592071557171e-07, + "loss": 0.1994, + "num_tokens": 988591106.0, + "reward": 1.0751953125, + "reward_std": 0.28985828161239624, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.021484375, + "rewards/format_reward/std": 0.14513419568538666, + "rewards/tag_count_reward/mean": 0.9326171875, + "rewards/tag_count_reward/std": 0.1723288893699646, "step": 1707 }, { @@ -49518,27 +49518,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1929.0, - "completions/mean_length": 824.849609375, - "completions/mean_terminated_length": 782.8424682617188, - "completions/min_length": 168.0, - "completions/min_terminated_length": 168.0, + "completions/max_terminated_length": 1413.0, + "completions/mean_length": 669.099609375, + "completions/mean_terminated_length": 663.6921997070312, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.5830844072714859, - "grad_norm": 3.5896201133728027, - "kl": 4.9921875, - "learning_rate": 4.983794690615535e-07, - "loss": 0.3309, - "num_tokens": 948672582.0, - "reward": 1.91357421875, - "reward_std": 0.5080065727233887, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.93505859375, - "rewards/tag_count_reward/std": 0.19360215961933136, + "grad_norm": 1.578294038772583, + "kl": 2.748046875, + "learning_rate": 4.986263658809185e-07, + "loss": 0.1673, + "num_tokens": 989015909.0, + "reward": 1.11279296875, + "reward_std": 0.24973051249980927, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.95849609375, + "rewards/tag_count_reward/std": 0.14114809036254883, "step": 1708 }, { @@ -49547,27 +49547,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 844.6953125, - "completions/mean_terminated_length": 803.3697509765625, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2012.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 694.37890625, + "completions/mean_terminated_length": 694.37890625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, "epoch": 0.5834257915848767, - "grad_norm": 1.4315153360366821, - "kl": 5.24609375, - "learning_rate": 4.978469368380787e-07, - "loss": 0.2995, - "num_tokens": 949188458.0, - "reward": 1.79443359375, - "reward_std": 0.539413332939148, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.20468609035015106, + "grad_norm": 5.183916091918945, + "kl": 2.78515625, + "learning_rate": 4.980935975769303e-07, + "loss": 0.1157, + "num_tokens": 989454823.0, + "reward": 1.005859375, + "reward_std": 0.22932368516921997, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.939453125, + "rewards/tag_count_reward/std": 0.16025325655937195, "step": 1709 }, { @@ -49576,27 +49576,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 808.55859375, - "completions/mean_terminated_length": 768.5765991210938, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/max_terminated_length": 1880.0, + "completions/mean_length": 683.53125, + "completions/mean_terminated_length": 680.8610229492188, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.5837671758982674, - "grad_norm": 2.2452232837677, - "kl": 4.375, - "learning_rate": 4.97314478636338e-07, - "loss": 0.2668, - "num_tokens": 949679192.0, - "reward": 1.9013671875, - "reward_std": 0.5118180513381958, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.1779806911945343, + "grad_norm": 1.4123501777648926, + "kl": 3.05859375, + "learning_rate": 4.975609030004938e-07, + "loss": 0.1359, + "num_tokens": 989881543.0, + "reward": 1.064453125, + "reward_std": 0.2676643133163452, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.17942707240581512, "step": 1710 }, { @@ -49605,27 +49605,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 793.44921875, - "completions/mean_terminated_length": 752.9797973632812, - "completions/min_length": 90.0, - "completions/min_terminated_length": 90.0, + "completions/max_terminated_length": 1857.0, + "completions/mean_length": 680.91796875, + "completions/mean_terminated_length": 678.24267578125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, "epoch": 0.5841085602116582, - "grad_norm": 3.081019163131714, - "kl": 4.6953125, - "learning_rate": 4.967820952120588e-07, - "loss": 0.3256, - "num_tokens": 950162430.0, - "reward": 1.85595703125, - "reward_std": 0.4917498230934143, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.18154938519001007, + "grad_norm": 6.666347026824951, + "kl": 2.64453125, + "learning_rate": 4.97028282908245e-07, + "loss": 0.2017, + "num_tokens": 990307165.0, + "reward": 1.0205078125, + "reward_std": 0.24870701134204865, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.16440613567829132, "step": 1711 }, { @@ -49634,27 +49634,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 854.888671875, - "completions/mean_terminated_length": 811.4149780273438, - "completions/min_length": 36.0, - "completions/min_terminated_length": 36.0, + "completions/max_terminated_length": 1882.0, + "completions/mean_length": 708.55859375, + "completions/mean_terminated_length": 705.9373779296875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.5844499445250491, - "grad_norm": 2.012058973312378, - "kl": 5.9453125, - "learning_rate": 4.962497873208616e-07, - "loss": 0.4021, - "num_tokens": 950670741.0, - "reward": 1.845703125, - "reward_std": 0.5269229412078857, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.2059757262468338, + "grad_norm": 1.9692797660827637, + "kl": 2.984375, + "learning_rate": 4.964957380567146e-07, + "loss": 0.1781, + "num_tokens": 990740555.0, + "reward": 1.02880859375, + "reward_std": 0.2653002142906189, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.91943359375, + "rewards/tag_count_reward/std": 0.19021636247634888, "step": 1712 }, { @@ -49663,27 +49663,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 841.134765625, - "completions/mean_terminated_length": 792.0751953125, - "completions/min_length": 55.0, - "completions/min_terminated_length": 55.0, + "completions/max_terminated_length": 1934.0, + "completions/mean_length": 679.177734375, + "completions/mean_terminated_length": 673.809814453125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, "epoch": 0.5847913288384399, - "grad_norm": 0.938133716583252, - "kl": 6.279296875, - "learning_rate": 4.957175557182601e-07, - "loss": 0.3664, - "num_tokens": 951185706.0, - "reward": 1.8212890625, - "reward_std": 0.5399194955825806, - "rewards/accuracy_reward/mean": 0.0786290317773819, - "rewards/accuracy_reward/std": 0.26943066716194153, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.1972721517086029, + "grad_norm": 4.167803764343262, + "kl": 3.28125, + "learning_rate": 4.959632692023262e-07, + "loss": 0.2214, + "num_tokens": 991172598.0, + "reward": 1.06884765625, + "reward_std": 0.29546090960502625, + "rewards/accuracy_reward/mean": 0.12903225421905518, + "rewards/accuracy_reward/std": 0.33557409048080444, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.1847475916147232, "step": 1713 }, { @@ -49692,27 +49692,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1880.0, - "completions/mean_length": 802.322265625, - "completions/mean_terminated_length": 764.726318359375, - "completions/min_length": 201.0, - "completions/min_terminated_length": 201.0, + "completions/max_terminated_length": 1511.0, + "completions/mean_length": 690.537109375, + "completions/mean_terminated_length": 679.8484497070312, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.5851327131518307, - "grad_norm": 0.9956303238868713, - "kl": 6.3125, - "learning_rate": 4.9518540115966e-07, - "loss": 0.4025, - "num_tokens": 951675007.0, - "reward": 1.91455078125, - "reward_std": 0.5179139375686646, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.94384765625, - "rewards/tag_count_reward/std": 0.16814936697483063, + "grad_norm": 4.968861103057861, + "kl": 2.708984375, + "learning_rate": 4.954308771013954e-07, + "loss": 0.1861, + "num_tokens": 991604665.0, + "reward": 1.13623046875, + "reward_std": 0.29561829566955566, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38430243730545044, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.94677734375, + "rewards/tag_count_reward/std": 0.15710307657718658, "step": 1714 }, { @@ -49721,27 +49721,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1898.0, - "completions/mean_length": 822.064453125, - "completions/mean_terminated_length": 790.1262817382812, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2042.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 659.2890625, + "completions/mean_terminated_length": 659.2890625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.5854740974652215, - "grad_norm": 1.209726333618164, - "kl": 4.58203125, - "learning_rate": 4.946533244003572e-07, - "loss": 0.2774, - "num_tokens": 952168288.0, - "reward": 1.87060546875, - "reward_std": 0.4697340428829193, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.94091796875, - "rewards/tag_count_reward/std": 0.17360158264636993, + "grad_norm": 3.05635666847229, + "kl": 1.857421875, + "learning_rate": 4.948985625101287e-07, + "loss": 0.1103, + "num_tokens": 992014605.0, + "reward": 1.02783203125, + "reward_std": 0.22893986105918884, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.95556640625, + "rewards/tag_count_reward/std": 0.14284388720989227, "step": 1715 }, { @@ -49750,27 +49750,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 874.486328125, - "completions/mean_terminated_length": 851.1095581054688, - "completions/min_length": 173.0, - "completions/min_terminated_length": 173.0, + "completions/max_terminated_length": 1804.0, + "completions/mean_length": 744.759765625, + "completions/mean_terminated_length": 742.2094116210938, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, "epoch": 0.5858154817786123, - "grad_norm": 0.7931808233261108, - "kl": 5.109375, - "learning_rate": 4.941213261955374e-07, - "loss": 0.3082, - "num_tokens": 952691609.0, - "reward": 1.89208984375, - "reward_std": 0.46906012296676636, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.94677734375, - "rewards/tag_count_reward/std": 0.16910135746002197, + "grad_norm": 3.31899356842041, + "kl": 2.75390625, + "learning_rate": 4.943663261846227e-07, + "loss": 0.1712, + "num_tokens": 992471506.0, + "reward": 1.01513671875, + "reward_std": 0.24060595035552979, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.92919921875, + "rewards/tag_count_reward/std": 0.17761725187301636, "step": 1716 }, { @@ -49779,27 +49779,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 884.333984375, - "completions/mean_terminated_length": 834.564208984375, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1824.0, + "completions/max_terminated_length": 1824.0, + "completions/mean_length": 692.4765625, + "completions/mean_terminated_length": 692.4765625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, "epoch": 0.5861568660920031, - "grad_norm": 3.3196003437042236, - "kl": 6.7265625, - "learning_rate": 4.935894073002749e-07, - "loss": 0.4011, - "num_tokens": 953222356.0, - "reward": 1.890625, - "reward_std": 0.5219039916992188, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.19562077522277832, + "grad_norm": 3.3546416759490967, + "kl": 2.349609375, + "learning_rate": 4.938341688808628e-07, + "loss": 0.0916, + "num_tokens": 992904022.0, + "reward": 1.11083984375, + "reward_std": 0.32329094409942627, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, + "rewards/format_reward/mean": 0.021484375, + "rewards/format_reward/std": 0.14513419568538666, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.17149166762828827, "step": 1717 }, { @@ -49808,27 +49808,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 835.525390625, - "completions/mean_terminated_length": 788.797119140625, - "completions/min_length": 180.0, - "completions/min_terminated_length": 180.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1929.0, + "completions/max_terminated_length": 1929.0, + "completions/mean_length": 675.7734375, + "completions/mean_terminated_length": 675.7734375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, "epoch": 0.5864982504053938, - "grad_norm": 1.0607223510742188, - "kl": 6.09375, - "learning_rate": 4.930575684695309e-07, - "loss": 0.3787, - "num_tokens": 953724689.0, - "reward": 1.92138671875, - "reward_std": 0.4783501625061035, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.94091796875, - "rewards/tag_count_reward/std": 0.17846523225307465, + "grad_norm": 2.7836081981658936, + "kl": 2.890625, + "learning_rate": 4.933020913547223e-07, + "loss": 0.1658, + "num_tokens": 993324562.0, + "reward": 1.0576171875, + "reward_std": 0.24416905641555786, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9443359375, + "rewards/tag_count_reward/std": 0.16050052642822266, "step": 1718 }, { @@ -49837,27 +49837,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 891.671875, - "completions/mean_terminated_length": 849.5385131835938, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1853.0, + "completions/max_terminated_length": 1853.0, + "completions/mean_length": 707.736328125, + "completions/mean_terminated_length": 707.736328125, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, "epoch": 0.5868396347187846, - "grad_norm": 1.6661996841430664, - "kl": 5.8203125, - "learning_rate": 4.925258104581534e-07, - "loss": 0.348, - "num_tokens": 954255721.0, - "reward": 1.87744140625, - "reward_std": 0.5170288681983948, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.1941147744655609, + "grad_norm": 2.291926145553589, + "kl": 2.9765625, + "learning_rate": 4.927700943619609e-07, + "loss": 0.1772, + "num_tokens": 993761419.0, + "reward": 1.04296875, + "reward_std": 0.2785525918006897, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.19499453902244568, "step": 1719 }, { @@ -49866,27 +49866,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 801.8515625, - "completions/mean_terminated_length": 779.5546264648438, - "completions/min_length": 168.0, - "completions/min_terminated_length": 168.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1883.0, + "completions/max_terminated_length": 1883.0, + "completions/mean_length": 628.4296875, + "completions/mean_terminated_length": 628.4296875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, "epoch": 0.5871810190321755, - "grad_norm": 1.2811676263809204, - "kl": 4.98046875, - "learning_rate": 4.91994134020876e-07, - "loss": 0.2976, - "num_tokens": 954739149.0, - "reward": 1.86865234375, - "reward_std": 0.46206235885620117, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.1572064310312271, + "grad_norm": 2.281153678894043, + "kl": 2.37109375, + "learning_rate": 4.922381786582241e-07, + "loss": 0.133, + "num_tokens": 994156055.0, + "reward": 1.06591796875, + "reward_std": 0.25389882922172546, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.95654296875, + "rewards/tag_count_reward/std": 0.14142537117004395, "step": 1720 }, { @@ -49895,27 +49895,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1903.0, - "completions/mean_length": 826.0234375, - "completions/mean_terminated_length": 806.6270141601562, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_terminated_length": 1748.0, + "completions/mean_length": 683.44921875, + "completions/mean_terminated_length": 680.7788696289062, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, "epoch": 0.5875224033455663, - "grad_norm": 1.8048064708709717, - "kl": 3.62890625, - "learning_rate": 4.914625399123159e-07, - "loss": 0.1979, - "num_tokens": 955234265.0, - "reward": 1.93994140625, - "reward_std": 0.48053503036499023, - "rewards/accuracy_reward/mean": 0.10080645233392715, - "rewards/accuracy_reward/std": 0.30137622356414795, - "rewards/format_reward/mean": 0.892578125, - "rewards/format_reward/std": 0.30995169281959534, - "rewards/tag_count_reward/mean": 0.94970703125, - "rewards/tag_count_reward/std": 0.1572915017604828, + "grad_norm": 2.4694714546203613, + "kl": 2.57421875, + "learning_rate": 4.917063449990416e-07, + "loss": 0.1405, + "num_tokens": 994578173.0, + "reward": 1.07763671875, + "reward_std": 0.2936060428619385, + "rewards/accuracy_reward/mean": 0.13306452333927155, + "rewards/accuracy_reward/std": 0.3399873375892639, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.93310546875, + "rewards/tag_count_reward/std": 0.17912793159484863, "step": 1721 }, { @@ -49924,27 +49924,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 867.78125, - "completions/mean_terminated_length": 794.3236694335938, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_terminated_length": 1858.0, + "completions/mean_length": 679.0, + "completions/mean_terminated_length": 668.220458984375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, "epoch": 0.5878637876589571, - "grad_norm": 1.2541394233703613, - "kl": 6.0078125, - "learning_rate": 4.909310288869737e-07, - "loss": 0.3846, - "num_tokens": 955757769.0, - "reward": 1.8447265625, - "reward_std": 0.4987034499645233, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.20051981508731842, + "grad_norm": 140.36610412597656, + "kl": 4.19921875, + "learning_rate": 4.91174594139827e-07, + "loss": 0.2362, + "num_tokens": 995005021.0, + "reward": 1.048828125, + "reward_std": 0.2565409243106842, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.935546875, + "rewards/tag_count_reward/std": 0.16843964159488678, "step": 1722 }, { @@ -49953,27 +49953,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 888.212890625, - "completions/mean_terminated_length": 845.9534912109375, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 707.935546875, + "completions/mean_terminated_length": 702.680419921875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, "epoch": 0.5882051719723479, - "grad_norm": 0.8701521754264832, - "kl": 5.796875, - "learning_rate": 4.903996016992323e-07, - "loss": 0.3725, - "num_tokens": 956299974.0, - "reward": 1.83642578125, - "reward_std": 0.5082464218139648, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.18705546855926514, + "grad_norm": 2.950345516204834, + "kl": 3.0078125, + "learning_rate": 4.906429268358762e-07, + "loss": 0.2013, + "num_tokens": 995454924.0, + "reward": 1.04248046875, + "reward_std": 0.25885578989982605, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.17292876541614532, "step": 1723 }, { @@ -49982,27 +49982,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1948.0, - "completions/mean_length": 873.1484375, - "completions/mean_terminated_length": 820.3999633789062, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/max_terminated_length": 1922.0, + "completions/mean_length": 682.8515625, + "completions/mean_terminated_length": 677.4981079101562, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.5885465562857387, - "grad_norm": 1.3485100269317627, - "kl": 5.02734375, - "learning_rate": 4.898682591033551e-07, - "loss": 0.3417, - "num_tokens": 956830402.0, - "reward": 1.81201171875, - "reward_std": 0.4673128128051758, - "rewards/accuracy_reward/mean": 0.021484375, - "rewards/accuracy_reward/std": 0.14513419568538666, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.1927117109298706, + "grad_norm": 2.735339879989624, + "kl": 3.26953125, + "learning_rate": 4.901113438423664e-07, + "loss": 0.1833, + "num_tokens": 995887920.0, + "reward": 0.96533203125, + "reward_std": 0.2332734763622284, + "rewards/accuracy_reward/mean": 0.02734375, + "rewards/accuracy_reward/std": 0.16324250400066376, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.92236328125, + "rewards/tag_count_reward/std": 0.18756051361560822, "step": 1724 }, { @@ -50011,27 +50011,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 920.4375, - "completions/mean_terminated_length": 869.8121948242188, - "completions/min_length": 195.0, - "completions/min_terminated_length": 195.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1627.0, + "completions/max_terminated_length": 1627.0, + "completions/mean_length": 682.0859375, + "completions/mean_terminated_length": 682.0859375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.5888879405991295, - "grad_norm": 2.1556503772735596, - "kl": 6.1875, - "learning_rate": 4.893370018534858e-07, - "loss": 0.3534, - "num_tokens": 957372210.0, - "reward": 1.75732421875, - "reward_std": 0.5546750426292419, - "rewards/accuracy_reward/mean": 0.02016128972172737, - "rewards/accuracy_reward/std": 0.14069372415542603, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.20635519921779633, + "grad_norm": 3.4718093872070312, + "kl": 3.1015625, + "learning_rate": 4.895798459143548e-07, + "loss": 0.1544, + "num_tokens": 996307692.0, + "reward": 0.962890625, + "reward_std": 0.2180371880531311, + "rewards/accuracy_reward/mean": 0.026209676638245583, + "rewards/accuracy_reward/std": 0.1599196344614029, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.18468615412712097, "step": 1725 }, { @@ -50040,27 +50040,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 928.28515625, - "completions/mean_terminated_length": 865.9505615234375, - "completions/min_length": 187.0, - "completions/min_terminated_length": 187.0, + "completions/max_terminated_length": 1890.0, + "completions/mean_length": 696.72265625, + "completions/mean_terminated_length": 694.0782470703125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, "epoch": 0.5892293249125202, - "grad_norm": 1.8250174522399902, - "kl": 6.9453125, - "learning_rate": 4.888058307036468e-07, - "loss": 0.4175, - "num_tokens": 957924964.0, - "reward": 1.822265625, - "reward_std": 0.5986999273300171, - "rewards/accuracy_reward/mean": 0.1088709682226181, - "rewards/accuracy_reward/std": 0.31179171800613403, - "rewards/format_reward/mean": 0.8125, - "rewards/format_reward/std": 0.39069411158561707, - "rewards/tag_count_reward/mean": 0.904296875, - "rewards/tag_count_reward/std": 0.2118576318025589, + "grad_norm": 2.1736018657684326, + "kl": 3.0234375, + "learning_rate": 4.890484338067781e-07, + "loss": 0.1775, + "num_tokens": 996741886.0, + "reward": 1.0595703125, + "reward_std": 0.3137122392654419, + "rewards/accuracy_reward/mean": 0.13508065044879913, + "rewards/accuracy_reward/std": 0.3421548008918762, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.18546834588050842, "step": 1726 }, { @@ -50069,27 +50069,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1777.0, - "completions/mean_length": 855.818359375, - "completions/mean_terminated_length": 797.1864013671875, - "completions/min_length": 70.0, - "completions/min_terminated_length": 70.0, + "completions/max_terminated_length": 1894.0, + "completions/mean_length": 691.771484375, + "completions/mean_terminated_length": 683.7780151367188, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, "epoch": 0.589570709225911, - "grad_norm": 2.7474205493927, - "kl": 6.890625, - "learning_rate": 4.882747464077388e-07, - "loss": 0.3981, - "num_tokens": 958442791.0, - "reward": 1.7666015625, - "reward_std": 0.5679394006729126, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.9033203125, - "rewards/tag_count_reward/std": 0.21141289174556732, + "grad_norm": 5.512001991271973, + "kl": 3.154296875, + "learning_rate": 4.885171082744506e-07, + "loss": 0.1753, + "num_tokens": 997175721.0, + "reward": 1.03173828125, + "reward_std": 0.2452293187379837, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.93603515625, + "rewards/tag_count_reward/std": 0.16898830235004425, "step": 1727 }, { @@ -50098,27 +50098,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 798.84375, - "completions/mean_terminated_length": 753.3279418945312, - "completions/min_length": 91.0, - "completions/min_terminated_length": 91.0, + "completions/max_terminated_length": 1866.0, + "completions/mean_length": 642.1953125, + "completions/mean_terminated_length": 639.4442138671875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, "epoch": 0.5899120935393019, - "grad_norm": 1.4245671033859253, - "kl": 5.1796875, - "learning_rate": 4.877437497195385e-07, - "loss": 0.3268, - "num_tokens": 958920935.0, - "reward": 1.80859375, - "reward_std": 0.5384195446968079, - "rewards/accuracy_reward/mean": 0.0463709682226181, - "rewards/accuracy_reward/std": 0.21049949526786804, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.1931036412715912, + "grad_norm": 3.7306766510009766, + "kl": 3.03125, + "learning_rate": 4.879858700720645e-07, + "loss": 0.2189, + "num_tokens": 997573661.0, + "reward": 1.02490234375, + "reward_std": 0.23586505651474, + "rewards/accuracy_reward/mean": 0.07661290466785431, + "rewards/accuracy_reward/std": 0.2662447690963745, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.94287109375, + "rewards/tag_count_reward/std": 0.1588330715894699, "step": 1728 }, { @@ -50127,27 +50127,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 838.607421875, - "completions/mean_terminated_length": 784.30810546875, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 1856.0, + "completions/mean_length": 682.791015625, + "completions/mean_terminated_length": 666.602783203125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, "epoch": 0.5902534778526927, - "grad_norm": 2.010416030883789, - "kl": 4.71875, - "learning_rate": 4.872128413926989e-07, - "loss": 0.3042, - "num_tokens": 959439294.0, - "reward": 1.80419921875, - "reward_std": 0.5523759126663208, - "rewards/accuracy_reward/mean": 0.06653226166963577, - "rewards/accuracy_reward/std": 0.24946178495883942, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.18677429854869843, + "grad_norm": 2.172267198562622, + "kl": 3.90234375, + "learning_rate": 4.874547199541871e-07, + "loss": 0.2491, + "num_tokens": 998012242.0, + "reward": 1.05810546875, + "reward_std": 0.29296067357063293, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310528099536896, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.1923096626996994, "step": 1729 }, { @@ -50156,27 +50156,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1963.0, - "completions/mean_length": 782.833984375, - "completions/mean_terminated_length": 747.2670288085938, - "completions/min_length": 50.0, - "completions/min_terminated_length": 50.0, + "completions/max_terminated_length": 1800.0, + "completions/mean_length": 675.595703125, + "completions/mean_terminated_length": 670.2137451171875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, "epoch": 0.5905948621660835, - "grad_norm": 2.0043551921844482, - "kl": 4.84765625, - "learning_rate": 4.866820221807467e-07, - "loss": 0.2975, - "num_tokens": 959918617.0, - "reward": 1.8447265625, - "reward_std": 0.5187762379646301, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.18374989926815033, + "grad_norm": 3.394458770751953, + "kl": 2.349609375, + "learning_rate": 4.869236586752612e-07, + "loss": 0.1408, + "num_tokens": 998436659.0, + "reward": 1.037109375, + "reward_std": 0.20615383982658386, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.14162877202033997, "step": 1730 }, { @@ -50185,27 +50185,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1909.0, - "completions/mean_length": 856.052734375, - "completions/mean_terminated_length": 817.602783203125, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_terminated_length": 1950.0, + "completions/mean_length": 736.423828125, + "completions/mean_terminated_length": 718.2435913085938, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.5909362464794743, - "grad_norm": 1.4563902616500854, - "kl": 3.94140625, - "learning_rate": 4.861512928370831e-07, - "loss": 0.2507, - "num_tokens": 960432660.0, - "reward": 1.86376953125, - "reward_std": 0.49892061948776245, - "rewards/accuracy_reward/mean": 0.08669354766607285, - "rewards/accuracy_reward/std": 0.281669557094574, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.17514143884181976, + "grad_norm": 4.250203609466553, + "kl": 3.189453125, + "learning_rate": 4.863926869896029e-07, + "loss": 0.219, + "num_tokens": 998889452.0, + "reward": 1.0283203125, + "reward_std": 0.2300959974527359, + "rewards/accuracy_reward/mean": 0.0927419364452362, + "rewards/accuracy_reward/std": 0.2903633117675781, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.17776581645011902, "step": 1731 }, { @@ -50214,27 +50214,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 857.337890625, - "completions/mean_terminated_length": 806.4134521484375, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 1895.0, + "completions/mean_length": 684.0390625, + "completions/mean_terminated_length": 681.369873046875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.5912776307928651, - "grad_norm": 2.9518094062805176, - "kl": 5.4375, - "learning_rate": 4.856206541149812e-07, - "loss": 0.3806, - "num_tokens": 960951377.0, - "reward": 1.7919921875, - "reward_std": 0.5659606456756592, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.9091796875, - "rewards/tag_count_reward/std": 0.20643681287765503, + "grad_norm": 3.9215407371520996, + "kl": 2.921875, + "learning_rate": 4.858618056514016e-07, + "loss": 0.1392, + "num_tokens": 999319440.0, + "reward": 1.08837890625, + "reward_std": 0.2949296236038208, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.17115144431591034, "step": 1732 }, { @@ -50243,27 +50243,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 781.751953125, - "completions/mean_terminated_length": 746.1546020507812, - "completions/min_length": 191.0, - "completions/min_terminated_length": 191.0, + "completions/max_terminated_length": 1886.0, + "completions/mean_length": 728.724609375, + "completions/mean_terminated_length": 718.3366088867188, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.5916190151062559, - "grad_norm": 2.4586381912231445, - "kl": 4.390625, - "learning_rate": 4.850901067675855e-07, - "loss": 0.292, - "num_tokens": 961427042.0, - "reward": 1.86865234375, - "reward_std": 0.4505774974822998, - "rewards/accuracy_reward/mean": 0.038306452333927155, - "rewards/accuracy_reward/std": 0.19212883710861206, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.17063702642917633, + "grad_norm": 2.9033477306365967, + "kl": 3.57421875, + "learning_rate": 4.853310154147176e-07, + "loss": 0.1964, + "num_tokens": 999767955.0, + "reward": 0.984375, + "reward_std": 0.22362488508224487, + "rewards/accuracy_reward/mean": 0.04233871027827263, + "rewards/accuracy_reward/std": 0.2015640139579773, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.1719430834054947, "step": 1733 }, { @@ -50272,27 +50272,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1864.0, - "completions/mean_length": 846.513671875, - "completions/mean_terminated_length": 800.2089233398438, - "completions/min_length": 185.0, - "completions/min_terminated_length": 185.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1966.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 693.205078125, + "completions/mean_terminated_length": 693.205078125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.5919603994196466, - "grad_norm": 2.042649507522583, - "kl": 5.4140625, - "learning_rate": 4.845596515479113e-07, - "loss": 0.359, - "num_tokens": 961939561.0, - "reward": 1.826171875, - "reward_std": 0.49255073070526123, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.18100711703300476, + "grad_norm": 4.697007179260254, + "kl": 2.73046875, + "learning_rate": 4.848003170334826e-07, + "loss": 0.17, + "num_tokens": 1000201980.0, + "reward": 1.03662109375, + "reward_std": 0.21143919229507446, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.95068359375, + "rewards/tag_count_reward/std": 0.14633327722549438, "step": 1734 }, { @@ -50301,27 +50301,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1920.0, - "completions/mean_length": 778.263671875, - "completions/mean_terminated_length": 729.32861328125, - "completions/min_length": 17.0, - "completions/min_terminated_length": 17.0, + "completions/max_terminated_length": 1748.0, + "completions/mean_length": 681.6875, + "completions/mean_terminated_length": 676.3294677734375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.5923017837330374, - "grad_norm": 1.1241272687911987, - "kl": 5.40625, - "learning_rate": 4.840292892088423e-07, - "loss": 0.3483, - "num_tokens": 962409280.0, - "reward": 1.8349609375, - "reward_std": 0.5005354881286621, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.9306640625, - "rewards/tag_count_reward/std": 0.19175048172473907, + "grad_norm": 4.053927898406982, + "kl": 3.00390625, + "learning_rate": 4.842697112614972e-07, + "loss": 0.2205, + "num_tokens": 1000622252.0, + "reward": 1.00634765625, + "reward_std": 0.22415204346179962, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.94384765625, + "rewards/tag_count_reward/std": 0.1666882485151291, "step": 1735 }, { @@ -50330,27 +50330,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 864.265625, - "completions/mean_terminated_length": 826.0806274414062, - "completions/min_length": 50.0, - "completions/min_terminated_length": 50.0, + "completions/max_terminated_length": 1723.0, + "completions/mean_length": 724.05078125, + "completions/mean_terminated_length": 721.4598999023438, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, "epoch": 0.5926431680464282, - "grad_norm": 2.7050602436065674, - "kl": 6.8671875, - "learning_rate": 4.834990205031314e-07, - "loss": 0.424, - "num_tokens": 962925720.0, - "reward": 1.84765625, - "reward_std": 0.5242334604263306, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.19767135381698608, + "grad_norm": 1.5162237882614136, + "kl": 2.455078125, + "learning_rate": 4.837391988524313e-07, + "loss": 0.1733, + "num_tokens": 1001066902.0, + "reward": 1.015625, + "reward_std": 0.2540043294429779, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.17690637707710266, "step": 1736 }, { @@ -50359,27 +50359,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 806.775390625, - "completions/mean_terminated_length": 761.548583984375, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 1921.0, + "completions/mean_length": 707.630859375, + "completions/mean_terminated_length": 705.0078125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.5929845523598191, - "grad_norm": 1.4915874004364014, - "kl": 6.6875, - "learning_rate": 4.829688461833975e-07, - "loss": 0.4136, - "num_tokens": 963409205.0, - "reward": 1.8388671875, - "reward_std": 0.5788981914520264, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.21240492165088654, + "grad_norm": 3.6893367767333984, + "kl": 2.74609375, + "learning_rate": 4.83208780559821e-07, + "loss": 0.1906, + "num_tokens": 1001499625.0, + "reward": 1.0341796875, + "reward_std": 0.2518423795700073, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.17378656566143036, "step": 1737 }, { @@ -50388,27 +50388,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 789.794921875, - "completions/mean_terminated_length": 741.3042602539062, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 712.94140625, + "completions/mean_terminated_length": 705.0726928710938, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.5933259366732099, - "grad_norm": 1.7969499826431274, - "kl": 6.6875, - "learning_rate": 4.824387670021263e-07, - "loss": 0.4079, - "num_tokens": 963888348.0, - "reward": 1.88232421875, - "reward_std": 0.504429042339325, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.19224505126476288, + "grad_norm": 3.1321589946746826, + "kl": 2.28125, + "learning_rate": 4.826784571370698e-07, + "loss": 0.1501, + "num_tokens": 1001939419.0, + "reward": 1.06396484375, + "reward_std": 0.24559524655342102, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.94873046875, + "rewards/tag_count_reward/std": 0.15382707118988037, "step": 1738 }, { @@ -50417,27 +50417,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 800.53125, - "completions/mean_terminated_length": 765.4617919921875, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/max_terminated_length": 1792.0, + "completions/mean_length": 691.447265625, + "completions/mean_terminated_length": 688.7925415039062, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, "epoch": 0.5936673209866007, - "grad_norm": 1.737382173538208, - "kl": 6.515625, - "learning_rate": 4.819087837116682e-07, - "loss": 0.3886, - "num_tokens": 964372828.0, - "reward": 1.83642578125, - "reward_std": 0.5354998707771301, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.1941147744655609, + "grad_norm": 2.446129322052002, + "kl": 1.958984375, + "learning_rate": 4.821482293374457e-07, + "loss": 0.1254, + "num_tokens": 1002368048.0, + "reward": 1.04833984375, + "reward_std": 0.2228967845439911, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.95654296875, + "rewards/tag_count_reward/std": 0.14142537117004395, "step": 1739 }, { @@ -50446,27 +50446,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 753.48046875, - "completions/mean_terminated_length": 709.0222778320312, - "completions/min_length": 94.0, - "completions/min_terminated_length": 94.0, + "completions/max_terminated_length": 1960.0, + "completions/mean_length": 666.48046875, + "completions/mean_terminated_length": 661.0628051757812, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, "epoch": 0.5940087052999915, - "grad_norm": 3.20028018951416, - "kl": 7.2890625, - "learning_rate": 4.813788970642373e-07, - "loss": 0.4455, - "num_tokens": 964836610.0, - "reward": 1.8701171875, - "reward_std": 0.5288277864456177, - "rewards/accuracy_reward/mean": 0.07661290466785431, - "rewards/accuracy_reward/std": 0.2662447690963745, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.20080552995204926, + "grad_norm": 2.8640072345733643, + "kl": 2.33203125, + "learning_rate": 4.816180979140815e-07, + "loss": 0.1512, + "num_tokens": 1002787286.0, + "reward": 1.09033203125, + "reward_std": 0.2300066202878952, + "rewards/accuracy_reward/mean": 0.12298387289047241, + "rewards/accuracy_reward/std": 0.32875028252601624, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.96337890625, + "rewards/tag_count_reward/std": 0.13086237013339996, "step": 1740 }, { @@ -50475,27 +50475,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 794.53125, - "completions/mean_terminated_length": 756.7001953125, - "completions/min_length": 93.0, - "completions/min_terminated_length": 93.0, + "completions/max_terminated_length": 1630.0, + "completions/mean_length": 676.17578125, + "completions/mean_terminated_length": 668.0903930664062, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, "epoch": 0.5943500896133823, - "grad_norm": 3.0313708782196045, - "kl": 7.06640625, - "learning_rate": 4.808491078119114e-07, - "loss": 0.4024, - "num_tokens": 965320258.0, - "reward": 1.86328125, - "reward_std": 0.49829375743865967, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.18937622010707855, + "grad_norm": 2.2872042655944824, + "kl": 3.1171875, + "learning_rate": 4.810880636199724e-07, + "loss": 0.1959, + "num_tokens": 1003210336.0, + "reward": 1.07861328125, + "reward_std": 0.28277212381362915, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.94970703125, + "rewards/tag_count_reward/std": 0.15960724651813507, "step": 1741 }, { @@ -50504,27 +50504,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1989.0, - "completions/mean_length": 862.416015625, - "completions/mean_terminated_length": 811.7088012695312, - "completions/min_length": 89.0, - "completions/min_terminated_length": 89.0, + "completions/max_terminated_length": 1944.0, + "completions/mean_length": 779.900390625, + "completions/mean_terminated_length": 772.4263305664062, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.594691473926773, - "grad_norm": 1.8369535207748413, - "kl": 7.1328125, - "learning_rate": 4.803194167066292e-07, - "loss": 0.4815, - "num_tokens": 965840903.0, - "reward": 1.8203125, - "reward_std": 0.5239598751068115, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.18817149102687836, + "grad_norm": 6.595098972320557, + "kl": 4.22265625, + "learning_rate": 4.805581272079764e-07, + "loss": 0.2315, + "num_tokens": 1003688733.0, + "reward": 0.99853515625, + "reward_std": 0.258689820766449, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.17453479766845703, "step": 1742 }, { @@ -50533,27 +50533,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 802.103515625, - "completions/mean_terminated_length": 774.7484741210938, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1675.0, + "completions/max_terminated_length": 1675.0, + "completions/mean_length": 696.111328125, + "completions/mean_terminated_length": 696.111328125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.5950328582401638, - "grad_norm": 2.128993034362793, - "kl": 6.3984375, - "learning_rate": 4.7978982450019e-07, - "loss": 0.3625, - "num_tokens": 966318460.0, - "reward": 1.8330078125, - "reward_std": 0.5399568676948547, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.18822988867759705, + "grad_norm": 1.806591510772705, + "kl": 2.623046875, + "learning_rate": 4.800282894308116e-07, + "loss": 0.1643, + "num_tokens": 1004112022.0, + "reward": 1.09375, + "reward_std": 0.26406821608543396, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.001953125, + "rewards/format_reward/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.955078125, + "rewards/tag_count_reward/std": 0.15382784605026245, "step": 1743 }, { @@ -50562,27 +50562,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 804.224609375, - "completions/mean_terminated_length": 779.4482421875, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1631.0, + "completions/max_terminated_length": 1631.0, + "completions/mean_length": 682.31640625, + "completions/mean_terminated_length": 682.31640625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, "epoch": 0.5953742425535546, - "grad_norm": 2.5650181770324707, - "kl": 4.31640625, - "learning_rate": 4.792603319442533e-07, - "loss": 0.2638, - "num_tokens": 966807199.0, - "reward": 1.83935546875, - "reward_std": 0.4204694628715515, - "rewards/accuracy_reward/mean": 0.025390625, - "rewards/accuracy_reward/std": 0.15746226906776428, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.94091796875, - "rewards/tag_count_reward/std": 0.1678706705570221, + "grad_norm": 2.5042014122009277, + "kl": 2.193359375, + "learning_rate": 4.794985510410569e-07, + "loss": 0.1227, + "num_tokens": 1004538344.0, + "reward": 1.01025390625, + "reward_std": 0.18221446871757507, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.95361328125, + "rewards/tag_count_reward/std": 0.14561976492404938, "step": 1744 }, { @@ -50591,27 +50591,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1974.0, - "completions/mean_length": 825.75390625, - "completions/mean_terminated_length": 796.4200439453125, - "completions/min_length": 193.0, - "completions/min_terminated_length": 193.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 770.05078125, + "completions/mean_terminated_length": 757.44775390625, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.5957156268669455, - "grad_norm": 2.448404312133789, - "kl": 4.71875, - "learning_rate": 4.78730939790337e-07, - "loss": 0.2983, - "num_tokens": 967309873.0, - "reward": 1.88525390625, - "reward_std": 0.4958968460559845, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.93798828125, - "rewards/tag_count_reward/std": 0.17677602171897888, + "grad_norm": 3.8201491832733154, + "kl": 4.00390625, + "learning_rate": 4.789689127911498e-07, + "loss": 0.2523, + "num_tokens": 1005012498.0, + "reward": 1.015625, + "reward_std": 0.30323147773742676, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.17694957554340363, "step": 1745 }, { @@ -50620,27 +50620,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 883.736328125, - "completions/mean_terminated_length": 843.7515258789062, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 801.185546875, + "completions/mean_terminated_length": 798.74560546875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.5960570111803363, - "grad_norm": 2.7602598667144775, - "kl": 5.578125, - "learning_rate": 4.782016487898163e-07, - "loss": 0.3717, - "num_tokens": 967838458.0, - "reward": 1.8173828125, - "reward_std": 0.5011401772499084, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.18835169076919556, + "grad_norm": 3.829974889755249, + "kl": 3.123046875, + "learning_rate": 4.784393754333849e-07, + "loss": 0.1536, + "num_tokens": 1005498817.0, + "reward": 1.02001953125, + "reward_std": 0.2365279495716095, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.94384765625, + "rewards/tag_count_reward/std": 0.15994812548160553, "step": 1746 }, { @@ -50649,27 +50649,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 852.3359375, - "completions/mean_terminated_length": 823.6400146484375, - "completions/min_length": 211.0, - "completions/min_terminated_length": 211.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 741.919921875, + "completions/mean_terminated_length": 739.364013671875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, "epoch": 0.5963983954937271, - "grad_norm": 1.6309069395065308, - "kl": 4.685546875, - "learning_rate": 4.77672459693923e-07, - "loss": 0.308, - "num_tokens": 968354118.0, - "reward": 1.91455078125, - "reward_std": 0.46543601155281067, - "rewards/accuracy_reward/mean": 0.08467742055654526, - "rewards/accuracy_reward/std": 0.278682142496109, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.1739315241575241, + "grad_norm": 6.226439476013184, + "kl": 3.015625, + "learning_rate": 4.779099397199142e-07, + "loss": 0.1408, + "num_tokens": 1005957944.0, + "reward": 1.0556640625, + "reward_std": 0.27483633160591125, + "rewards/accuracy_reward/mean": 0.10282257944345474, + "rewards/accuracy_reward/std": 0.30403366684913635, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.1598801612854004, "step": 1747 }, { @@ -50678,27 +50678,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1915.0, - "completions/mean_length": 782.896484375, - "completions/mean_terminated_length": 755.1197509765625, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/max_terminated_length": 1760.0, + "completions/mean_length": 705.189453125, + "completions/mean_terminated_length": 702.5616455078125, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, "epoch": 0.5967397798071179, - "grad_norm": 1.6585683822631836, - "kl": 4.765625, - "learning_rate": 4.771433732537446e-07, - "loss": 0.3026, - "num_tokens": 968830913.0, - "reward": 1.88818359375, - "reward_std": 0.4815041720867157, - "rewards/accuracy_reward/mean": 0.08266129344701767, - "rewards/accuracy_reward/std": 0.2756475806236267, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.1909683793783188, + "grad_norm": 2.1409924030303955, + "kl": 2.63671875, + "learning_rate": 4.77380606402745e-07, + "loss": 0.1517, + "num_tokens": 1006394953.0, + "reward": 1.1142578125, + "reward_std": 0.25171735882759094, + "rewards/accuracy_reward/mean": 0.15927419066429138, + "rewards/accuracy_reward/std": 0.366301029920578, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.9482421875, + "rewards/tag_count_reward/std": 0.15406061708927155, "step": 1748 }, { @@ -50707,27 +50707,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1977.0, - "completions/mean_length": 831.87109375, - "completions/mean_terminated_length": 756.178466796875, - "completions/min_length": 73.0, - "completions/min_terminated_length": 73.0, + "completions/max_terminated_length": 1771.0, + "completions/mean_length": 698.529296875, + "completions/mean_terminated_length": 695.888427734375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, "epoch": 0.5970811641205087, - "grad_norm": 1.4092459678649902, - "kl": 8.4375, - "learning_rate": 4.7661439022022186e-07, - "loss": 0.5519, - "num_tokens": 969329967.0, - "reward": 1.81201171875, - "reward_std": 0.527290940284729, - "rewards/accuracy_reward/mean": 0.04032257944345474, - "rewards/accuracy_reward/std": 0.19691328704357147, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.19871696829795837, + "grad_norm": 3.9644875526428223, + "kl": 2.90625, + "learning_rate": 4.768513762337396e-07, + "loss": 0.1355, + "num_tokens": 1006825736.0, + "reward": 1.017578125, + "reward_std": 0.20936693251132965, + "rewards/accuracy_reward/mean": 0.058467742055654526, + "rewards/accuracy_reward/std": 0.23486268520355225, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.951171875, + "rewards/tag_count_reward/std": 0.14691409468650818, "step": 1749 }, { @@ -50736,27 +50736,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1996.0, - "completions/mean_length": 828.638671875, - "completions/mean_terminated_length": 794.3594360351562, - "completions/min_length": 41.0, - "completions/min_terminated_length": 41.0, + "completions/max_terminated_length": 1785.0, + "completions/mean_length": 716.73046875, + "completions/mean_terminated_length": 711.5098266601562, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, "epoch": 0.5974225484338994, - "grad_norm": 1.1671388149261475, - "kl": 5.625, - "learning_rate": 4.7608551134415e-07, - "loss": 0.3178, - "num_tokens": 969827126.0, - "reward": 1.87548828125, - "reward_std": 0.545221745967865, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.18809467554092407, + "grad_norm": 2.5164918899536133, + "kl": 2.666015625, + "learning_rate": 4.763222499646129e-07, + "loss": 0.1269, + "num_tokens": 1007265598.0, + "reward": 1.0556640625, + "reward_std": 0.26374131441116333, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9462890625, + "rewards/tag_count_reward/std": 0.16267666220664978, "step": 1750 }, { @@ -50765,27 +50765,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1760.0, - "completions/mean_length": 793.0625, - "completions/mean_terminated_length": 736.7183227539062, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1766.0, + "completions/max_terminated_length": 1766.0, + "completions/mean_length": 707.34765625, + "completions/mean_terminated_length": 707.34765625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.5977639327472902, - "grad_norm": 1.3640391826629639, - "kl": 7.6171875, - "learning_rate": 4.755567373761755e-07, - "loss": 0.4561, - "num_tokens": 970305782.0, - "reward": 1.89111328125, - "reward_std": 0.5875589847564697, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.20123985409736633, + "grad_norm": 2.287261724472046, + "kl": 1.5546875, + "learning_rate": 4.757932283469334e-07, + "loss": 0.072, + "num_tokens": 1007700368.0, + "reward": 1.16943359375, + "reward_std": 0.2524051368236542, + "rewards/accuracy_reward/mean": 0.18359375, + "rewards/accuracy_reward/std": 0.3875311613082886, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.97021484375, + "rewards/tag_count_reward/std": 0.11044542491436005, "step": 1751 }, { @@ -50794,27 +50794,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1870.0, - "completions/mean_length": 852.9453125, - "completions/mean_terminated_length": 799.2897338867188, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1736.0, + "completions/max_terminated_length": 1736.0, + "completions/mean_length": 734.8671875, + "completions/mean_terminated_length": 734.8671875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, "epoch": 0.598105317060681, - "grad_norm": 2.3671305179595947, - "kl": 8.265625, - "learning_rate": 4.750280690667965e-07, - "loss": 0.4942, - "num_tokens": 970814026.0, - "reward": 1.8359375, - "reward_std": 0.5786428451538086, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.20616121590137482, + "grad_norm": 2.722154378890991, + "kl": 1.8916015625, + "learning_rate": 4.7526431213211973e-07, + "loss": 0.1033, + "num_tokens": 1008148156.0, + "reward": 1.11865234375, + "reward_std": 0.266380250453949, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, + "rewards/tag_count_reward/mean": 0.96826171875, + "rewards/tag_count_reward/std": 0.11949627846479416, "step": 1752 }, { @@ -50823,27 +50823,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 788.517578125, - "completions/mean_terminated_length": 745.2626342773438, - "completions/min_length": 99.0, - "completions/min_terminated_length": 99.0, + "completions/max_terminated_length": 1920.0, + "completions/mean_length": 721.396484375, + "completions/mean_terminated_length": 716.1941528320312, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, "epoch": 0.5984467013740719, - "grad_norm": 1.7761777639389038, - "kl": 5.46875, - "learning_rate": 4.744995071663609e-07, - "loss": 0.3671, - "num_tokens": 971298835.0, - "reward": 1.85546875, - "reward_std": 0.5084636807441711, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.19447441399097443, + "grad_norm": 3.9714505672454834, + "kl": 2.609375, + "learning_rate": 4.7473550207144174e-07, + "loss": 0.1468, + "num_tokens": 1008598599.0, + "reward": 1.05712890625, + "reward_std": 0.2545734941959381, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.94580078125, + "rewards/tag_count_reward/std": 0.15754644572734833, "step": 1753 }, { @@ -50852,27 +50852,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 809.068359375, - "completions/mean_terminated_length": 771.676025390625, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1983.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 777.484375, + "completions/mean_terminated_length": 777.484375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, "epoch": 0.5987880856874627, - "grad_norm": 2.005474090576172, - "kl": 6.1640625, - "learning_rate": 4.7397105242506576e-07, - "loss": 0.3522, - "num_tokens": 971784486.0, - "reward": 1.85498046875, - "reward_std": 0.4786653518676758, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.18679475784301758, + "grad_norm": 3.222487449645996, + "kl": 1.912109375, + "learning_rate": 4.74206798916018e-07, + "loss": 0.1269, + "num_tokens": 1009068079.0, + "reward": 1.05712890625, + "reward_std": 0.2246779501438141, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.95166015625, + "rewards/tag_count_reward/std": 0.14995817840099335, "step": 1754 }, { @@ -50881,27 +50881,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 815.5859375, - "completions/mean_terminated_length": 773.2606201171875, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 1714.0, + "completions/mean_length": 734.046875, + "completions/mean_terminated_length": 721.0887451171875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.5991294700008535, - "grad_norm": 1.6871479749679565, - "kl": 5.4296875, - "learning_rate": 4.734427055929556e-07, - "loss": 0.3421, - "num_tokens": 972284898.0, - "reward": 1.9296875, - "reward_std": 0.5219883918762207, - "rewards/accuracy_reward/mean": 0.134765625, - "rewards/accuracy_reward/std": 0.3418070077896118, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.17621363699436188, + "grad_norm": 2.475099802017212, + "kl": 2.82421875, + "learning_rate": 4.7367820341681563e-07, + "loss": 0.1863, + "num_tokens": 1009526743.0, + "reward": 1.1630859375, + "reward_std": 0.2831161320209503, + "rewards/accuracy_reward/mean": 0.193359375, + "rewards/accuracy_reward/std": 0.39531853795051575, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.9560546875, + "rewards/tag_count_reward/std": 0.14511774480342865, "step": 1755 }, { @@ -50910,27 +50910,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 918.212890625, - "completions/mean_terminated_length": 865.0736083984375, - "completions/min_length": 56.0, - "completions/min_terminated_length": 56.0, + "completions/max_terminated_length": 1806.0, + "completions/mean_length": 811.884765625, + "completions/mean_terminated_length": 804.5992431640625, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.5994708543142443, - "grad_norm": 2.2570464611053467, - "kl": 7.0859375, - "learning_rate": 4.729144674199225e-07, - "loss": 0.4195, - "num_tokens": 972835807.0, - "reward": 1.81201171875, - "reward_std": 0.5604456663131714, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.91162109375, - "rewards/tag_count_reward/std": 0.21187059581279755, + "grad_norm": 3.2170121669769287, + "kl": 2.50390625, + "learning_rate": 4.731497163246482e-07, + "loss": 0.137, + "num_tokens": 1010023212.0, + "reward": 1.0634765625, + "reward_std": 0.2751937806606293, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, + "rewards/tag_count_reward/mean": 0.9462890625, + "rewards/tag_count_reward/std": 0.15654632449150085, "step": 1756 }, { @@ -50939,27 +50939,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 812.84765625, - "completions/mean_terminated_length": 778.1244506835938, - "completions/min_length": 69.0, - "completions/min_terminated_length": 69.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 757.19921875, + "completions/mean_terminated_length": 754.6731567382812, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.5998122386276351, - "grad_norm": 1.5576038360595703, - "kl": 5.0, - "learning_rate": 4.723863386557037e-07, - "loss": 0.2982, - "num_tokens": 973336257.0, - "reward": 1.8232421875, - "reward_std": 0.5296144485473633, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.193694069981575, + "grad_norm": 1.6145522594451904, + "kl": 2.25390625, + "learning_rate": 4.7262133839017624e-07, + "loss": 0.1158, + "num_tokens": 1010495170.0, + "reward": 1.064453125, + "reward_std": 0.23234979808330536, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.14371834695339203, "step": 1757 }, { @@ -50968,27 +50968,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 863.595703125, - "completions/mean_terminated_length": 797.6598510742188, - "completions/min_length": 72.0, - "completions/min_terminated_length": 72.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 739.642578125, + "completions/mean_terminated_length": 737.0822143554688, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.6001536229410258, - "grad_norm": 1.4606642723083496, - "kl": 6.8203125, - "learning_rate": 4.7185832004988133e-07, - "loss": 0.4245, - "num_tokens": 973856546.0, - "reward": 1.79638671875, - "reward_std": 0.584352970123291, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.89990234375, - "rewards/tag_count_reward/std": 0.2197779268026352, + "grad_norm": 4.367508888244629, + "kl": 2.59375, + "learning_rate": 4.720930703639041e-07, + "loss": 0.1272, + "num_tokens": 1010951995.0, + "reward": 1.0595703125, + "reward_std": 0.29212260246276855, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.17168447375297546, "step": 1758 }, { @@ -50997,27 +50997,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 812.927734375, - "completions/mean_terminated_length": 767.9251098632812, - "completions/min_length": 33.0, - "completions/min_terminated_length": 33.0, + "completions/max_terminated_length": 1916.0, + "completions/mean_length": 725.37109375, + "completions/mean_terminated_length": 720.184326171875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.6004950072544166, - "grad_norm": 2.716848373413086, - "kl": 6.0625, - "learning_rate": 4.713304123518814e-07, - "loss": 0.3726, - "num_tokens": 974342205.0, - "reward": 1.8486328125, - "reward_std": 0.5314549803733826, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.19321990013122559, + "grad_norm": 4.212222576141357, + "kl": 2.9296875, + "learning_rate": 4.7156491299618105e-07, + "loss": 0.1705, + "num_tokens": 1011392825.0, + "reward": 1.078125, + "reward_std": 0.27235230803489685, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.939453125, + "rewards/tag_count_reward/std": 0.16402500867843628, "step": 1759 }, { @@ -51026,27 +51026,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1929.0, - "completions/mean_length": 807.177734375, - "completions/mean_terminated_length": 754.1079711914062, - "completions/min_length": 45.0, - "completions/min_terminated_length": 45.0, + "completions/max_terminated_length": 1803.0, + "completions/mean_length": 774.53125, + "completions/mean_terminated_length": 759.4308471679688, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, "epoch": 0.6008363915678074, - "grad_norm": 1.1507644653320312, - "kl": 5.33203125, - "learning_rate": 4.708026163109725e-07, - "loss": 0.3146, - "num_tokens": 974835912.0, - "reward": 1.86767578125, - "reward_std": 0.6231339573860168, - "rewards/accuracy_reward/mean": 0.13671875, - "rewards/accuracy_reward/std": 0.3438861668109894, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.20320084691047668, + "grad_norm": 1.740332007408142, + "kl": 3.107421875, + "learning_rate": 4.710368670371985e-07, + "loss": 0.2019, + "num_tokens": 1011869817.0, + "reward": 1.14404296875, + "reward_std": 0.34377244114875793, + "rewards/accuracy_reward/mean": 0.19140625, + "rewards/accuracy_reward/std": 0.3937928080558777, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, + "rewards/tag_count_reward/mean": 0.93310546875, + "rewards/tag_count_reward/std": 0.1845095157623291, "step": 1760 }, { @@ -51055,27 +51055,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1924.0, - "completions/mean_length": 762.54296875, - "completions/mean_terminated_length": 731.6920166015625, - "completions/min_length": 175.0, - "completions/min_terminated_length": 175.0, + "completions/max_terminated_length": 1807.0, + "completions/mean_length": 733.302734375, + "completions/mean_terminated_length": 728.1470947265625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.6011777758811983, - "grad_norm": 1.9752110242843628, - "kl": 3.89453125, - "learning_rate": 4.7027493267626405e-07, - "loss": 0.2471, - "num_tokens": 975302254.0, - "reward": 1.9130859375, - "reward_std": 0.4799373745918274, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.1709705889225006, + "grad_norm": 4.355317115783691, + "kl": 2.74609375, + "learning_rate": 4.705089332369901e-07, + "loss": 0.1454, + "num_tokens": 1012321188.0, + "reward": 1.07177734375, + "reward_std": 0.25991693139076233, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.94482421875, + "rewards/tag_count_reward/std": 0.16104954481124878, "step": 1761 }, { @@ -51084,27 +51084,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 856.0390625, - "completions/mean_terminated_length": 815.1030883789062, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_terminated_length": 1677.0, + "completions/mean_length": 771.171875, + "completions/mean_terminated_length": 756.0316772460938, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, "epoch": 0.6015191601945891, - "grad_norm": 1.9344838857650757, - "kl": 4.41015625, - "learning_rate": 4.69747362196707e-07, - "loss": 0.2692, - "num_tokens": 975815538.0, - "reward": 1.84375, - "reward_std": 0.5603270530700684, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.20338943600654602, + "grad_norm": 2.1956727504730225, + "kl": 2.44921875, + "learning_rate": 4.699811123454295e-07, + "loss": 0.1379, + "num_tokens": 1012791020.0, + "reward": 1.072265625, + "reward_std": 0.2925443947315216, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.16767774522304535, "step": 1762 }, { @@ -51113,27 +51113,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1928.0, - "completions/mean_length": 765.466796875, - "completions/mean_terminated_length": 739.9183349609375, - "completions/min_length": 221.0, - "completions/min_terminated_length": 221.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 729.283203125, + "completions/mean_terminated_length": 711.0039672851562, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.6018605445079799, - "grad_norm": 2.057072401046753, - "kl": 4.1875, - "learning_rate": 4.692199056210907e-07, - "loss": 0.2883, - "num_tokens": 976278129.0, - "reward": 1.89599609375, - "reward_std": 0.4938367009162903, - "rewards/accuracy_reward/mean": 0.08870967477560043, - "rewards/accuracy_reward/std": 0.284611314535141, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.18700948357582092, + "grad_norm": 3.447355270385742, + "kl": 3.71875, + "learning_rate": 4.69453405112231e-07, + "loss": 0.2491, + "num_tokens": 1013235085.0, + "reward": 1.08544921875, + "reward_std": 0.28955644369125366, + "rewards/accuracy_reward/mean": 0.13306452333927155, + "rewards/accuracy_reward/std": 0.3399873673915863, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.94482421875, + "rewards/tag_count_reward/std": 0.16405922174453735, "step": 1763 }, { @@ -51142,27 +51142,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 817.630859375, - "completions/mean_terminated_length": 780.4969482421875, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_terminated_length": 1659.0, + "completions/mean_length": 709.8125, + "completions/mean_terminated_length": 704.5647583007812, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, "epoch": 0.6022019288213707, - "grad_norm": 2.9331674575805664, - "kl": 4.86328125, - "learning_rate": 4.6869256369804353e-07, - "loss": 0.3337, - "num_tokens": 976778484.0, - "reward": 1.8681640625, - "reward_std": 0.5320004224777222, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.1779162585735321, + "grad_norm": 2.0475871562957764, + "kl": 2.287109375, + "learning_rate": 4.689258122869463e-07, + "loss": 0.145, + "num_tokens": 1013680237.0, + "reward": 1.10693359375, + "reward_std": 0.26224619150161743, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.95849609375, + "rewards/tag_count_reward/std": 0.131270632147789, "step": 1764 }, { @@ -51171,27 +51171,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 834.357421875, - "completions/mean_terminated_length": 785.0223388671875, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1915.0, + "completions/max_terminated_length": 1915.0, + "completions/mean_length": 721.408203125, + "completions/mean_terminated_length": 721.408203125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.6025433131347615, - "grad_norm": 1.946372628211975, - "kl": 5.8671875, - "learning_rate": 4.6816533717603093e-07, - "loss": 0.3678, - "num_tokens": 977282603.0, - "reward": 1.822265625, - "reward_std": 0.4897192716598511, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.19042283296585083, + "grad_norm": 6.5366435050964355, + "kl": 2.6171875, + "learning_rate": 4.683983346189656e-07, + "loss": 0.1781, + "num_tokens": 1014126526.0, + "reward": 1.0556640625, + "reward_std": 0.23004157841205597, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.9501953125, + "rewards/tag_count_reward/std": 0.1570582091808319, "step": 1765 }, { @@ -51200,27 +51200,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 851.9609375, - "completions/mean_terminated_length": 785.3773193359375, - "completions/min_length": 182.0, - "completions/min_terminated_length": 182.0, + "completions/max_terminated_length": 1730.0, + "completions/mean_length": 720.806640625, + "completions/mean_terminated_length": 712.9843139648438, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.6028846974481522, - "grad_norm": 2.542891025543213, - "kl": 8.015625, - "learning_rate": 4.676382268033544e-07, - "loss": 0.4818, - "num_tokens": 977793479.0, - "reward": 1.78271484375, - "reward_std": 0.5828242897987366, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.90771484375, - "rewards/tag_count_reward/std": 0.21019525825977325, + "grad_norm": 2.583763360977173, + "kl": 3.494140625, + "learning_rate": 4.6787097285751487e-07, + "loss": 0.1885, + "num_tokens": 1014570251.0, + "reward": 1.08349609375, + "reward_std": 0.2778605818748474, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.06243881583213806, + "rewards/tag_count_reward/mean": 0.94873046875, + "rewards/tag_count_reward/std": 0.15142296254634857, "step": 1766 }, { @@ -51229,27 +51229,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1979.0, - "completions/mean_length": 872.7109375, - "completions/mean_terminated_length": 812.3778686523438, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 728.986328125, + "completions/mean_terminated_length": 723.8137817382812, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.603226081761543, - "grad_norm": 3.6636009216308594, - "kl": 8.1875, - "learning_rate": 4.671112333281508e-07, - "loss": 0.472, - "num_tokens": 978319091.0, - "reward": 1.7412109375, - "reward_std": 0.6191123127937317, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.787109375, - "rewards/format_reward/std": 0.409751296043396, - "rewards/tag_count_reward/mean": 0.8916015625, - "rewards/tag_count_reward/std": 0.22221922874450684, + "grad_norm": 2.0975725650787354, + "kl": 3.8203125, + "learning_rate": 4.673437277516559e-07, + "loss": 0.2293, + "num_tokens": 1015022276.0, + "reward": 1.0146484375, + "reward_std": 0.26882147789001465, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.1898675262928009, "step": 1767 }, { @@ -51258,27 +51258,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1977.0, - "completions/mean_length": 807.677734375, - "completions/mean_terminated_length": 759.876220703125, - "completions/min_length": 224.0, - "completions/min_terminated_length": 224.0, + "completions/max_terminated_length": 1488.0, + "completions/mean_length": 663.76953125, + "completions/mean_terminated_length": 661.0606689453125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.6035674660749338, - "grad_norm": 1.4426823854446411, - "kl": 6.58984375, - "learning_rate": 4.6658435749839087e-07, - "loss": 0.3797, - "num_tokens": 978810686.0, - "reward": 1.8359375, - "reward_std": 0.5612466335296631, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.1972164362668991, + "grad_norm": 4.848918437957764, + "kl": 3.8203125, + "learning_rate": 4.668166000502842e-07, + "loss": 0.171, + "num_tokens": 1015440190.0, + "reward": 1.07373046875, + "reward_std": 0.24046632647514343, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.95263671875, + "rewards/tag_count_reward/std": 0.1444602608680725, "step": 1768 }, { @@ -51287,27 +51287,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 808.4453125, - "completions/mean_terminated_length": 768.4596557617188, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/max_terminated_length": 1724.0, + "completions/mean_length": 672.0546875, + "completions/mean_terminated_length": 666.6588745117188, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, "epoch": 0.6039088503883246, - "grad_norm": 3.0837481021881104, - "kl": 8.3828125, - "learning_rate": 4.6605760006187857e-07, - "loss": 0.4804, - "num_tokens": 979296594.0, - "reward": 1.814453125, - "reward_std": 0.5509793758392334, - "rewards/accuracy_reward/mean": 0.09677419066429138, - "rewards/accuracy_reward/std": 0.2959485352039337, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.912109375, - "rewards/tag_count_reward/std": 0.19990174472332, + "grad_norm": 1.7236839532852173, + "kl": 4.12109375, + "learning_rate": 4.6628959050212936e-07, + "loss": 0.2199, + "num_tokens": 1015856266.0, + "reward": 1.09765625, + "reward_std": 0.27422866225242615, + "rewards/accuracy_reward/mean": 0.14717741310596466, + "rewards/accuracy_reward/std": 0.354640394449234, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.939453125, + "rewards/tag_count_reward/std": 0.1647689789533615, "step": 1769 }, { @@ -51316,27 +51316,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 833.59375, - "completions/mean_terminated_length": 789.3441772460938, - "completions/min_length": 86.0, - "completions/min_terminated_length": 86.0, + "completions/max_terminated_length": 1780.0, + "completions/mean_length": 676.41015625, + "completions/mean_terminated_length": 673.7260131835938, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, "epoch": 0.6042502347017155, - "grad_norm": 1.2210311889648438, - "kl": 6.515625, - "learning_rate": 4.655309617662496e-07, - "loss": 0.3899, - "num_tokens": 979796498.0, - "reward": 1.78857421875, - "reward_std": 0.5330498814582825, - "rewards/accuracy_reward/mean": 0.04032257944345474, - "rewards/accuracy_reward/std": 0.19691328704357147, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.19277119636535645, + "grad_norm": 10.14164924621582, + "kl": 4.3828125, + "learning_rate": 4.657626998557522e-07, + "loss": 0.2248, + "num_tokens": 1016275692.0, + "reward": 0.9912109375, + "reward_std": 0.24511289596557617, + "rewards/accuracy_reward/mean": 0.04233871027827263, + "rewards/accuracy_reward/std": 0.2015640139579773, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.17226234078407288, "step": 1770 }, { @@ -51345,27 +51345,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 808.107421875, - "completions/mean_terminated_length": 765.5252685546875, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/max_terminated_length": 1815.0, + "completions/mean_length": 698.900390625, + "completions/mean_terminated_length": 696.26025390625, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, "epoch": 0.6045916190151063, - "grad_norm": 1.361059308052063, - "kl": 6.078125, - "learning_rate": 4.6500444335897094e-07, - "loss": 0.379, - "num_tokens": 980292009.0, - "reward": 1.8291015625, - "reward_std": 0.5372997522354126, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.9287109375, - "rewards/tag_count_reward/std": 0.18845312297344208, + "grad_norm": 4.63817834854126, + "kl": 4.0234375, + "learning_rate": 4.6523592885954553e-07, + "loss": 0.2464, + "num_tokens": 1016715289.0, + "reward": 1.04296875, + "reward_std": 0.2594906687736511, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.170570507645607, "step": 1771 }, { @@ -51374,27 +51374,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 869.541015625, - "completions/mean_terminated_length": 826.6012573242188, - "completions/min_length": 173.0, - "completions/min_terminated_length": 173.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 724.939453125, + "completions/mean_terminated_length": 722.3502807617188, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, "epoch": 0.6049330033284971, - "grad_norm": 0.9149605631828308, - "kl": 7.703125, - "learning_rate": 4.6447804558733894e-07, - "loss": 0.4712, - "num_tokens": 980813406.0, - "reward": 1.7626953125, - "reward_std": 0.5920639038085938, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.8046875, - "rewards/format_reward/std": 0.3968288004398346, - "rewards/tag_count_reward/mean": 0.9072265625, - "rewards/tag_count_reward/std": 0.21200865507125854, + "grad_norm": 2.2953732013702393, + "kl": 3.63671875, + "learning_rate": 4.6470927826173155e-07, + "loss": 0.2088, + "num_tokens": 1017162650.0, + "reward": 1.013671875, + "reward_std": 0.272574245929718, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.939453125, + "rewards/tag_count_reward/std": 0.16624696552753448, "step": 1772 }, { @@ -51403,27 +51403,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 873.810546875, - "completions/mean_terminated_length": 828.5578002929688, - "completions/min_length": 153.0, - "completions/min_terminated_length": 153.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2019.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 723.96875, + "completions/mean_terminated_length": 723.96875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, "epoch": 0.6052743876418879, - "grad_norm": 1.3184945583343506, - "kl": 6.984375, - "learning_rate": 4.6395176919847923e-07, - "loss": 0.4523, - "num_tokens": 981341549.0, - "reward": 1.8056640625, - "reward_std": 0.529266893863678, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.18835169076919556, + "grad_norm": 3.4601573944091797, + "kl": 3.2109375, + "learning_rate": 4.641827488103619e-07, + "loss": 0.1665, + "num_tokens": 1017614074.0, + "reward": 1.03271484375, + "reward_std": 0.286705881357193, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.17503775656223297, "step": 1773 }, { @@ -51432,27 +51432,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 920.623046875, - "completions/mean_terminated_length": 865.17822265625, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 1930.0, + "completions/mean_length": 735.119140625, + "completions/mean_terminated_length": 729.9706420898438, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, "epoch": 0.6056157719552786, - "grad_norm": 0.7841996550559998, - "kl": 7.484375, - "learning_rate": 4.634256149393445e-07, - "loss": 0.4723, - "num_tokens": 981891260.0, - "reward": 1.80615234375, - "reward_std": 0.5783164501190186, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.91162109375, - "rewards/tag_count_reward/std": 0.2175668478012085, + "grad_norm": 4.324336051940918, + "kl": 3.8515625, + "learning_rate": 4.6365634125331566e-07, + "loss": 0.2169, + "num_tokens": 1018068807.0, + "reward": 1.04150390625, + "reward_std": 0.28216075897216797, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.93994140625, + "rewards/tag_count_reward/std": 0.17113468050956726, "step": 1774 }, { @@ -51461,27 +51461,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 841.51171875, - "completions/mean_terminated_length": 787.3428344726562, - "completions/min_length": 70.0, - "completions/min_terminated_length": 70.0, + "completions/max_terminated_length": 1787.0, + "completions/mean_length": 714.685546875, + "completions/mean_terminated_length": 709.4569091796875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.6059571562686694, - "grad_norm": 2.017267942428589, - "kl": 8.0546875, - "learning_rate": 4.6289958355671475e-07, - "loss": 0.5094, - "num_tokens": 982402210.0, - "reward": 1.7998046875, - "reward_std": 0.5805681943893433, - "rewards/accuracy_reward/mean": 0.06854838877916336, - "rewards/accuracy_reward/std": 0.25293973088264465, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.9072265625, - "rewards/tag_count_reward/std": 0.21713875234127045, + "grad_norm": 2.312756299972534, + "kl": 3.185546875, + "learning_rate": 4.631300563382994e-07, + "loss": 0.1781, + "num_tokens": 1018514822.0, + "reward": 1.02685546875, + "reward_std": 0.27983659505844116, + "rewards/accuracy_reward/mean": 0.08870967477560043, + "rewards/accuracy_reward/std": 0.2846112847328186, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.93310546875, + "rewards/tag_count_reward/std": 0.19102340936660767, "step": 1775 }, { @@ -51490,27 +51490,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1973.0, - "completions/mean_length": 795.16015625, - "completions/mean_terminated_length": 733.5450439453125, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 1800.0, + "completions/mean_length": 704.3828125, + "completions/mean_terminated_length": 699.11376953125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.6062985405820602, - "grad_norm": 1.6451234817504883, - "kl": 7.171875, - "learning_rate": 4.6237367579719535e-07, - "loss": 0.4567, - "num_tokens": 982886500.0, - "reward": 1.84814453125, - "reward_std": 0.5437964200973511, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.20619770884513855, + "grad_norm": 2.4387314319610596, + "kl": 3.03515625, + "learning_rate": 4.626038948128448e-07, + "loss": 0.1833, + "num_tokens": 1018952634.0, + "reward": 1.05908203125, + "reward_std": 0.25380903482437134, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.94384765625, + "rewards/tag_count_reward/std": 0.15763740241527557, "step": 1776 }, { @@ -51519,27 +51519,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 888.806640625, - "completions/mean_terminated_length": 819.20703125, - "completions/min_length": 215.0, - "completions/min_terminated_length": 215.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 733.876953125, + "completions/mean_terminated_length": 731.3052978515625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, "epoch": 0.606639924895451, - "grad_norm": 2.459740161895752, - "kl": 8.1015625, - "learning_rate": 4.618478924072159e-07, - "loss": 0.5034, - "num_tokens": 983413681.0, - "reward": 1.826171875, - "reward_std": 0.5603066086769104, - "rewards/accuracy_reward/mean": 0.08064515888690948, - "rewards/accuracy_reward/std": 0.2725643217563629, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.21027308702468872, + "grad_norm": 6.01278829574585, + "kl": 1.82421875, + "learning_rate": 4.6207785742430895e-07, + "loss": 0.1036, + "num_tokens": 1019400491.0, + "reward": 1.0712890625, + "reward_std": 0.25445666909217834, + "rewards/accuracy_reward/mean": 0.12298387289047241, + "rewards/accuracy_reward/std": 0.32875028252601624, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.16588735580444336, "step": 1777 }, { @@ -51548,27 +51548,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 778.66796875, - "completions/mean_terminated_length": 721.677490234375, - "completions/min_length": 123.0, - "completions/min_terminated_length": 123.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 715.544921875, + "completions/mean_terminated_length": 707.6915893554688, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, "epoch": 0.6069813092088419, - "grad_norm": 2.2952628135681152, - "kl": 5.4375, - "learning_rate": 4.6132223413303e-07, - "loss": 0.369, - "num_tokens": 983888807.0, - "reward": 1.88818359375, - "reward_std": 0.5219037532806396, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.18844488263130188, + "grad_norm": 1.677288293838501, + "kl": 2.98828125, + "learning_rate": 4.615519449198719e-07, + "loss": 0.2031, + "num_tokens": 1019843298.0, + "reward": 1.0029296875, + "reward_std": 0.234297513961792, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.18000927567481995, "step": 1778 }, { @@ -51577,27 +51577,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1951.0, - "completions/mean_length": 869.060546875, - "completions/mean_terminated_length": 803.4288940429688, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 1846.0, + "completions/mean_length": 743.66015625, + "completions/mean_terminated_length": 741.1076049804688, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.6073226935222327, - "grad_norm": 1.1419627666473389, - "kl": 6.7265625, - "learning_rate": 4.607967017207135e-07, - "loss": 0.4287, - "num_tokens": 984408246.0, - "reward": 1.82421875, - "reward_std": 0.5274852514266968, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.20715993642807007, + "grad_norm": 4.2561516761779785, + "kl": 2.01953125, + "learning_rate": 4.6102615804653724e-07, + "loss": 0.0626, + "num_tokens": 1020298532.0, + "reward": 1.07177734375, + "reward_std": 0.2906448245048523, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.94677734375, + "rewards/tag_count_reward/std": 0.15787968039512634, "step": 1779 }, { @@ -51606,27 +51606,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0703125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 941.478515625, - "completions/mean_terminated_length": 857.7920532226562, - "completions/min_length": 19.0, - "completions/min_terminated_length": 19.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 798.525390625, + "completions/mean_terminated_length": 796.0802001953125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.6076640778356235, - "grad_norm": 1.2240148782730103, - "kl": 8.890625, - "learning_rate": 4.602712959161633e-07, - "loss": 0.5813, - "num_tokens": 984972603.0, - "reward": 1.72705078125, - "reward_std": 0.6361920833587646, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.798828125, - "rewards/format_reward/std": 0.4012683033943176, - "rewards/tag_count_reward/mean": 0.89111328125, - "rewards/tag_count_reward/std": 0.23404912650585175, + "grad_norm": 1.9754955768585205, + "kl": 1.96875, + "learning_rate": 4.6050049755112906e-07, + "loss": 0.1122, + "num_tokens": 1020789697.0, + "reward": 1.03515625, + "reward_std": 0.2722569704055786, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.17345911264419556, "step": 1780 }, { @@ -51635,27 +51635,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 802.990234375, - "completions/mean_terminated_length": 760.2323608398438, - "completions/min_length": 64.0, - "completions/min_terminated_length": 64.0, + "completions/max_terminated_length": 1748.0, + "completions/mean_length": 710.72265625, + "completions/mean_terminated_length": 700.1929321289062, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, "epoch": 0.6080054621490143, - "grad_norm": 2.643549919128418, - "kl": 5.59375, - "learning_rate": 4.5974601746509687e-07, - "loss": 0.3628, - "num_tokens": 985459318.0, - "reward": 1.85791015625, - "reward_std": 0.5174456834793091, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.1944885402917862, + "grad_norm": 6.035069942474365, + "kl": 2.59375, + "learning_rate": 4.599749641802928e-07, + "loss": 0.2463, + "num_tokens": 1021229171.0, + "reward": 1.046875, + "reward_std": 0.23933148384094238, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.14755012094974518, "step": 1781 }, { @@ -51664,27 +51664,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1951.0, - "completions/mean_length": 801.52734375, - "completions/mean_terminated_length": 776.6972045898438, - "completions/min_length": 36.0, - "completions/min_terminated_length": 36.0, + "completions/max_terminated_length": 1812.0, + "completions/mean_length": 736.779296875, + "completions/mean_terminated_length": 726.4547119140625, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, "epoch": 0.608346846462405, - "grad_norm": 2.1473066806793213, - "kl": 5.1171875, - "learning_rate": 4.592208671130511e-07, - "loss": 0.3116, - "num_tokens": 985940276.0, - "reward": 1.880859375, - "reward_std": 0.4827001094818115, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.16916421055793762, + "grad_norm": 5.188677787780762, + "kl": 2.197265625, + "learning_rate": 4.5944955868049276e-07, + "loss": 0.1452, + "num_tokens": 1021676978.0, + "reward": 1.0986328125, + "reward_std": 0.3167649507522583, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.9443359375, + "rewards/tag_count_reward/std": 0.17083640396595, "step": 1782 }, { @@ -51693,27 +51693,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 885.35546875, - "completions/mean_terminated_length": 825.6715087890625, - "completions/min_length": 201.0, - "completions/min_terminated_length": 201.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 786.251953125, + "completions/mean_terminated_length": 776.3169555664062, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, "epoch": 0.6086882307757958, - "grad_norm": 1.3922321796417236, - "kl": 8.65625, - "learning_rate": 4.5869584560538065e-07, - "loss": 0.5819, - "num_tokens": 986471658.0, - "reward": 1.8017578125, - "reward_std": 0.5646194815635681, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.20736047625541687, + "grad_norm": 1.9753355979919434, + "kl": 2.0859375, + "learning_rate": 4.5892428179801213e-07, + "loss": 0.1149, + "num_tokens": 1022157619.0, + "reward": 1.03369140625, + "reward_std": 0.23831097781658173, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.94384765625, + "rewards/tag_count_reward/std": 0.16887517273426056, "step": 1783 }, { @@ -51722,27 +51722,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1966.0, - "completions/mean_length": 896.87109375, - "completions/mean_terminated_length": 835.2880249023438, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 803.28515625, + "completions/mean_terminated_length": 793.4842529296875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, "epoch": 0.6090296150891866, - "grad_norm": 2.539964437484741, - "kl": 7.078125, - "learning_rate": 4.5817095368725754e-07, - "loss": 0.4367, - "num_tokens": 987017864.0, - "reward": 1.81103515625, - "reward_std": 0.5181088447570801, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.19693946838378906, + "grad_norm": 3.008613348007202, + "kl": 2.6015625, + "learning_rate": 4.5839913427895083e-07, + "loss": 0.1611, + "num_tokens": 1022655909.0, + "reward": 1.005859375, + "reward_std": 0.2187550961971283, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.17270830273628235, "step": 1784 }, { @@ -51751,27 +51751,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 829.134765625, - "completions/mean_terminated_length": 771.8057250976562, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 754.59375, + "completions/mean_terminated_length": 744.409423828125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.6093709994025774, - "grad_norm": 1.532462239265442, - "kl": 6.3046875, - "learning_rate": 4.576461921036702e-07, - "loss": 0.373, - "num_tokens": 987520045.0, - "reward": 1.837890625, - "reward_std": 0.5260030031204224, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.18913386762142181, + "grad_norm": 5.499574661254883, + "kl": 3.171875, + "learning_rate": 4.578741168692256e-07, + "loss": 0.1854, + "num_tokens": 1023119925.0, + "reward": 1.06005859375, + "reward_std": 0.27437031269073486, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.18187542259693146, "step": 1785 }, { @@ -51780,27 +51780,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1911.0, - "completions/mean_length": 769.2421875, - "completions/mean_terminated_length": 730.6478881835938, - "completions/min_length": 9.0, - "completions/min_terminated_length": 9.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 739.7265625, + "completions/mean_terminated_length": 732.0157470703125, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, "epoch": 0.6097123837159683, - "grad_norm": 1.843642234802246, - "kl": 6.5234375, - "learning_rate": 4.571215615994212e-07, - "loss": 0.3969, - "num_tokens": 987992569.0, - "reward": 1.791015625, - "reward_std": 0.515088677406311, - "rewards/accuracy_reward/mean": 0.021484375, - "rewards/accuracy_reward/std": 0.14513419568538666, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.2005603015422821, + "grad_norm": 4.903466701507568, + "kl": 2.9921875, + "learning_rate": 4.5734923031456783e-07, + "loss": 0.1803, + "num_tokens": 1023577337.0, + "reward": 0.994140625, + "reward_std": 0.2194230556488037, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.1939331740140915, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.16840559244155884, "step": 1786 }, { @@ -51809,27 +51809,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 830.81640625, - "completions/mean_terminated_length": 778.7576904296875, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 1922.0, + "completions/mean_length": 783.1953125, + "completions/mean_terminated_length": 775.74072265625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.6100537680293591, - "grad_norm": 2.2469699382781982, - "kl": 7.3203125, - "learning_rate": 4.565970629191278e-07, - "loss": 0.4796, - "num_tokens": 988498443.0, - "reward": 1.83935546875, - "reward_std": 0.5874500274658203, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.20386749505996704, + "grad_norm": 3.9970672130584717, + "kl": 4.25, + "learning_rate": 4.568244753605237e-07, + "loss": 0.2438, + "num_tokens": 1024058829.0, + "reward": 1.07861328125, + "reward_std": 0.28232330083847046, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.021484375, + "rewards/format_reward/std": 0.14513419568538666, + "rewards/tag_count_reward/mean": 0.93408203125, + "rewards/tag_count_reward/std": 0.17111793160438538, "step": 1787 }, { @@ -51838,27 +51838,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 836.029296875, - "completions/mean_terminated_length": 773.8131713867188, - "completions/min_length": 89.0, - "completions/min_terminated_length": 89.0, + "completions/max_terminated_length": 1930.0, + "completions/mean_length": 782.5, + "completions/mean_terminated_length": 780.0234985351562, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, "epoch": 0.6103951523427499, - "grad_norm": 1.9404956102371216, - "kl": 6.78125, - "learning_rate": 4.5607269680721993e-07, - "loss": 0.4275, - "num_tokens": 989004986.0, - "reward": 1.83154296875, - "reward_std": 0.5891203880310059, - "rewards/accuracy_reward/mean": 0.08467742055654526, - "rewards/accuracy_reward/std": 0.278682142496109, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.20348748564720154, + "grad_norm": 1.9265402555465698, + "kl": 3.02734375, + "learning_rate": 4.5629985275245174e-07, + "loss": 0.1676, + "num_tokens": 1024537965.0, + "reward": 1.06884765625, + "reward_std": 0.30155232548713684, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310528099536896, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.93408203125, + "rewards/tag_count_reward/std": 0.17324896156787872, "step": 1788 }, { @@ -51867,27 +51867,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 855.521484375, - "completions/mean_terminated_length": 812.0708618164062, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 1517.0, + "completions/mean_length": 771.45703125, + "completions/mean_terminated_length": 766.4510498046875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.6107365366561407, - "grad_norm": 3.4830024242401123, - "kl": 5.78125, - "learning_rate": 4.5554846400793946e-07, - "loss": 0.3474, - "num_tokens": 989519189.0, - "reward": 1.8271484375, - "reward_std": 0.524437427520752, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.20418420433998108, + "grad_norm": 2.423401117324829, + "kl": 2.548828125, + "learning_rate": 4.557753632355231e-07, + "loss": 0.1423, + "num_tokens": 1025009127.0, + "reward": 1.05712890625, + "reward_std": 0.2346060425043106, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.94580078125, + "rewards/tag_count_reward/std": 0.160621777176857, "step": 1789 }, { @@ -51896,27 +51896,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1979.0, - "completions/mean_length": 869.517578125, - "completions/mean_terminated_length": 819.1140747070312, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 810.84375, + "completions/mean_terminated_length": 801.1023559570312, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.6110779209695314, - "grad_norm": 1.4256885051727295, - "kl": 5.33984375, - "learning_rate": 4.5502436526533896e-07, - "loss": 0.3369, - "num_tokens": 990048414.0, - "reward": 1.81103515625, - "reward_std": 0.540465235710144, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.2020832896232605, + "grad_norm": 1.7359230518341064, + "kl": 3.4296875, + "learning_rate": 4.5525100755471934e-07, + "loss": 0.1985, + "num_tokens": 1025508311.0, + "reward": 1.060546875, + "reward_std": 0.2512783408164978, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.16840559244155884, "step": 1790 }, { @@ -51925,27 +51925,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 830.203125, - "completions/mean_terminated_length": 783.269775390625, - "completions/min_length": 84.0, - "completions/min_terminated_length": 84.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 758.826171875, + "completions/mean_terminated_length": 751.2279052734375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, "epoch": 0.6114193052829222, - "grad_norm": 1.4382879734039307, - "kl": 6.2109375, - "learning_rate": 4.5450040132328074e-07, - "loss": 0.3999, - "num_tokens": 990549542.0, - "reward": 1.818359375, - "reward_std": 0.5191929340362549, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.20497123897075653, + "grad_norm": 4.741996765136719, + "kl": 2.685546875, + "learning_rate": 4.5472678645483264e-07, + "loss": 0.1593, + "num_tokens": 1025972894.0, + "reward": 1.04541015625, + "reward_std": 0.2292695790529251, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.95751953125, + "rewards/tag_count_reward/std": 0.14172235131263733, "step": 1791 }, { @@ -51954,27 +51954,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 764.9296875, - "completions/mean_terminated_length": 744.5635375976562, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 1929.0, + "completions/mean_length": 720.42578125, + "completions/mean_terminated_length": 712.6011962890625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, "epoch": 0.611760689596313, - "grad_norm": 3.7264513969421387, - "kl": 4.953125, - "learning_rate": 4.539765729254356e-07, - "loss": 0.3689, - "num_tokens": 991009922.0, - "reward": 1.89794921875, - "reward_std": 0.5064840912818909, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.94287109375, - "rewards/tag_count_reward/std": 0.16781944036483765, + "grad_norm": 2.515331745147705, + "kl": 3.34765625, + "learning_rate": 4.5420270068046315e-07, + "loss": 0.1812, + "num_tokens": 1026410488.0, + "reward": 1.08447265625, + "reward_std": 0.29895588755607605, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.94970703125, + "rewards/tag_count_reward/std": 0.16037173569202423, "step": 1792 }, { @@ -51983,27 +51983,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 868.408203125, - "completions/mean_terminated_length": 812.9263916015625, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 790.01953125, + "completions/mean_terminated_length": 782.6051635742188, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, "epoch": 0.6121020739097038, - "grad_norm": 2.518594264984131, - "kl": 5.83984375, - "learning_rate": 4.5345288081528223e-07, - "loss": 0.4031, - "num_tokens": 991533651.0, - "reward": 1.77587890625, - "reward_std": 0.545759916305542, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.20959889888763428, + "grad_norm": 3.2399539947509766, + "kl": 2.69140625, + "learning_rate": 4.536787509760196e-07, + "loss": 0.1414, + "num_tokens": 1026894082.0, + "reward": 1.02734375, + "reward_std": 0.24196061491966248, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.943359375, + "rewards/tag_count_reward/std": 0.1523297280073166, "step": 1793 }, { @@ -52012,27 +52012,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1923.0, - "completions/mean_length": 859.126953125, - "completions/mean_terminated_length": 790.34912109375, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 1664.0, + "completions/mean_length": 783.7890625, + "completions/mean_terminated_length": 776.3379516601562, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.6124434582230946, - "grad_norm": 1.2283124923706055, - "kl": 6.4296875, - "learning_rate": 4.529293257361059e-07, - "loss": 0.3984, - "num_tokens": 992054148.0, - "reward": 1.79736328125, - "reward_std": 0.5642428994178772, + "grad_norm": 2.166257858276367, + "kl": 3.046875, + "learning_rate": 4.531549380857168e-07, + "loss": 0.1788, + "num_tokens": 1027376006.0, + "reward": 1.0302734375, + "reward_std": 0.2507689595222473, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.90478515625, - "rewards/tag_count_reward/std": 0.21916839480400085, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.9462890625, + "rewards/tag_count_reward/std": 0.16192306578159332, "step": 1794 }, { @@ -52041,27 +52041,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 761.9375, - "completions/mean_terminated_length": 725.7830810546875, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_terminated_length": 1891.0, + "completions/mean_length": 738.693359375, + "completions/mean_terminated_length": 736.131103515625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.6127848425364855, - "grad_norm": 1.0940067768096924, - "kl": 4.1796875, - "learning_rate": 4.5240590843099725e-07, - "loss": 0.2422, - "num_tokens": 992532868.0, - "reward": 1.8994140625, - "reward_std": 0.4732905924320221, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.17729215323925018, + "grad_norm": 7.018588542938232, + "kl": 3.56640625, + "learning_rate": 4.5263126275357575e-07, + "loss": 0.1786, + "num_tokens": 1027842825.0, + "reward": 1.10546875, + "reward_std": 0.3067556619644165, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.17341503500938416, "step": 1795 }, { @@ -52070,27 +52070,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1960.0, - "completions/mean_length": 836.771484375, - "completions/mean_terminated_length": 782.3897705078125, - "completions/min_length": 82.0, - "completions/min_terminated_length": 82.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1579.0, + "completions/max_terminated_length": 1579.0, + "completions/mean_length": 746.927734375, + "completions/mean_terminated_length": 746.927734375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, "epoch": 0.6131262268498763, - "grad_norm": 1.637098789215088, - "kl": 5.5546875, - "learning_rate": 4.5188262964285126e-07, - "loss": 0.3707, - "num_tokens": 993043439.0, - "reward": 1.85009765625, - "reward_std": 0.4896104633808136, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.18488718569278717, + "grad_norm": 1.7091294527053833, + "kl": 1.48046875, + "learning_rate": 4.521077257234217e-07, + "loss": 0.0607, + "num_tokens": 1028307396.0, + "reward": 1.0498046875, + "reward_std": 0.2198580652475357, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.9638671875, + "rewards/tag_count_reward/std": 0.13053062558174133, "step": 1796 }, { @@ -52099,27 +52099,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1974.0, - "completions/mean_length": 796.544921875, - "completions/mean_terminated_length": 737.6829833984375, - "completions/min_length": 233.0, - "completions/min_terminated_length": 233.0, + "completions/max_terminated_length": 1827.0, + "completions/mean_length": 743.974609375, + "completions/mean_terminated_length": 733.7066650390625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, "epoch": 0.6134676111632671, - "grad_norm": 1.0109014511108398, - "kl": 7.0390625, - "learning_rate": 4.513594901143668e-07, - "loss": 0.4573, - "num_tokens": 993528486.0, - "reward": 1.8330078125, - "reward_std": 0.4917706251144409, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.18266506493091583, + "grad_norm": 5.36768913269043, + "kl": 2.41015625, + "learning_rate": 4.515843277388839e-07, + "loss": 0.1004, + "num_tokens": 1028765527.0, + "reward": 1.044921875, + "reward_std": 0.24271979928016663, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.025390625, + "rewards/format_reward/std": 0.15746226906776428, + "rewards/tag_count_reward/mean": 0.955078125, + "rewards/tag_count_reward/std": 0.1422615498304367, "step": 1797 }, { @@ -52128,27 +52128,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 802.662109375, - "completions/mean_terminated_length": 757.2854614257812, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 798.8671875, + "completions/mean_terminated_length": 796.4226684570312, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, "epoch": 0.6138089954766578, - "grad_norm": 1.6239712238311768, - "kl": 5.8203125, - "learning_rate": 4.508364905880444e-07, - "loss": 0.3338, - "num_tokens": 994030281.0, - "reward": 1.8154296875, - "reward_std": 0.4730170965194702, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.19093148410320282, + "grad_norm": 2.0743041038513184, + "kl": 3.09375, + "learning_rate": 4.5106106954339327e-07, + "loss": 0.1669, + "num_tokens": 1029265379.0, + "reward": 1.0107421875, + "reward_std": 0.2517067492008209, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.17660093307495117, "step": 1798 }, { @@ -52157,27 +52157,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 836.919921875, - "completions/mean_terminated_length": 790.2454223632812, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 800.849609375, + "completions/mean_terminated_length": 786.061279296875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, "epoch": 0.6141503797900486, - "grad_norm": 4.2510151863098145, - "kl": 9.7265625, - "learning_rate": 4.503136318061863e-07, - "loss": 0.6063, - "num_tokens": 994535536.0, - "reward": 1.7734375, - "reward_std": 0.5860233306884766, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.904296875, - "rewards/tag_count_reward/std": 0.2203473001718521, + "grad_norm": 3.665811061859131, + "kl": 3.015625, + "learning_rate": 4.5053795188018316e-07, + "loss": 0.2191, + "num_tokens": 1029752166.0, + "reward": 1.0166015625, + "reward_std": 0.2390669882297516, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.17727059125900269, "step": 1799 }, { @@ -52186,27 +52186,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1873.0, - "completions/mean_length": 739.123046875, - "completions/mean_terminated_length": 699.6196899414062, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 1679.0, + "completions/mean_length": 721.693359375, + "completions/mean_terminated_length": 719.0978393554688, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.6144917641034394, - "grad_norm": 2.4480273723602295, - "kl": 7.953125, - "learning_rate": 4.497909145108949e-07, - "loss": 0.4878, - "num_tokens": 994994879.0, - "reward": 1.828125, - "reward_std": 0.5364860892295837, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.1931036412715912, + "grad_norm": 4.459508895874023, + "kl": 1.3291015625, + "learning_rate": 4.5001497549228653e-07, + "loss": 0.0465, + "num_tokens": 1030202585.0, + "reward": 1.0693359375, + "reward_std": 0.21111838519573212, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9677734375, + "rewards/tag_count_reward/std": 0.12190000712871552, "step": 1800 }, { @@ -52215,27 +52215,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1919.0, - "completions/mean_length": 746.984375, - "completions/mean_terminated_length": 713.0902099609375, - "completions/min_length": 18.0, - "completions/min_terminated_length": 18.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 737.447265625, + "completions/mean_terminated_length": 732.307861328125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.6148331484168302, - "grad_norm": 3.0909080505371094, - "kl": 6.66796875, - "learning_rate": 4.4926833944407207e-07, - "loss": 0.3548, - "num_tokens": 995446583.0, - "reward": 1.8310546875, - "reward_std": 0.5081585645675659, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.18964596092700958, + "grad_norm": 2.111485719680786, + "kl": 1.927734375, + "learning_rate": 4.494921411225363e-07, + "loss": 0.094, + "num_tokens": 1030649406.0, + "reward": 1.08642578125, + "reward_std": 0.2624194025993347, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.96337890625, + "rewards/tag_count_reward/std": 0.13271848857402802, "step": 1801 }, { @@ -52244,27 +52244,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 723.724609375, - "completions/mean_terminated_length": 675.4716796875, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 698.75390625, + "completions/mean_terminated_length": 696.113525390625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.615174532730221, - "grad_norm": 2.3278188705444336, - "kl": 6.2265625, - "learning_rate": 4.4874590734741715e-07, - "loss": 0.3666, - "num_tokens": 995890890.0, - "reward": 1.9013671875, - "reward_std": 0.5298936367034912, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.1889999359846115, + "grad_norm": 7.498244762420654, + "kl": 1.900390625, + "learning_rate": 4.4896944951356295e-07, + "loss": 0.1191, + "num_tokens": 1031080928.0, + "reward": 1.1865234375, + "reward_std": 0.3081634044647217, + "rewards/accuracy_reward/mean": 0.212890625, + "rewards/accuracy_reward/std": 0.409751296043396, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.9560546875, + "rewards/tag_count_reward/std": 0.14511774480342865, "step": 1802 }, { @@ -52273,27 +52273,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1858.0, - "completions/mean_length": 806.29296875, - "completions/mean_terminated_length": 758.4381103515625, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 767.427734375, + "completions/mean_terminated_length": 764.9216918945312, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.6155159170436119, - "grad_norm": 1.382660984992981, - "kl": 6.4765625, - "learning_rate": 4.4822361896242734e-07, - "loss": 0.4099, - "num_tokens": 996374256.0, - "reward": 1.85546875, - "reward_std": 0.5291862487792969, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.2070399820804596, + "grad_norm": 4.030727386474609, + "kl": 1.4384765625, + "learning_rate": 4.484469014077953e-07, + "loss": 0.0726, + "num_tokens": 1031544395.0, + "reward": 1.04345703125, + "reward_std": 0.22301135957241058, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.95751953125, + "rewards/tag_count_reward/std": 0.1484660655260086, "step": 1803 }, { @@ -52302,27 +52302,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 767.041015625, - "completions/mean_terminated_length": 725.7197265625, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/max_terminated_length": 1747.0, + "completions/mean_length": 709.197265625, + "completions/mean_terminated_length": 706.5772705078125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.6158573013570027, - "grad_norm": 1.1367708444595337, - "kl": 5.271484375, - "learning_rate": 4.477014750303951e-07, - "loss": 0.3426, - "num_tokens": 996849669.0, - "reward": 1.8408203125, - "reward_std": 0.4900355339050293, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.9287109375, - "rewards/tag_count_reward/std": 0.1935756355524063, + "grad_norm": 3.7080092430114746, + "kl": 1.3154296875, + "learning_rate": 4.479244975474569e-07, + "loss": 0.0729, + "num_tokens": 1031990192.0, + "reward": 1.04248046875, + "reward_std": 0.18790854513645172, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.96435546875, + "rewards/tag_count_reward/std": 0.1320616751909256, "step": 1804 }, { @@ -52331,27 +52331,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1948.0, - "completions/mean_length": 764.02734375, - "completions/mean_terminated_length": 727.9317016601562, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 1892.0, + "completions/mean_length": 752.2421875, + "completions/mean_terminated_length": 747.1608276367188, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.6161986856703935, - "grad_norm": 1.0454238653182983, - "kl": 5.19921875, - "learning_rate": 4.471794762924084e-07, - "loss": 0.3482, - "num_tokens": 997314259.0, - "reward": 1.9091796875, - "reward_std": 0.47368374466896057, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.16946613788604736, + "grad_norm": 2.501222848892212, + "kl": 1.84375, + "learning_rate": 4.4740223867456737e-07, + "loss": 0.1341, + "num_tokens": 1032448748.0, + "reward": 1.109375, + "reward_std": 0.2864750027656555, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.955078125, + "rewards/tag_count_reward/std": 0.15303067862987518, "step": 1805 }, { @@ -52360,27 +52360,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1908.0, - "completions/mean_length": 706.70703125, - "completions/mean_terminated_length": 688.1148681640625, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/max_terminated_length": 1723.0, + "completions/mean_length": 707.9609375, + "completions/mean_terminated_length": 697.409423828125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, "epoch": 0.6165400699837843, - "grad_norm": 3.4908785820007324, - "kl": 4.6171875, - "learning_rate": 4.46657623489349e-07, - "loss": 0.317, - "num_tokens": 997757597.0, - "reward": 1.91357421875, - "reward_std": 0.5355505347251892, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.94091796875, - "rewards/tag_count_reward/std": 0.17360158264636993, + "grad_norm": 10.757259368896484, + "kl": 2.1181640625, + "learning_rate": 4.4688012553094033e-07, + "loss": 0.1948, + "num_tokens": 1032892728.0, + "reward": 1.07470703125, + "reward_std": 0.1719028353691101, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.96142578125, + "rewards/tag_count_reward/std": 0.13760361075401306, "step": 1806 }, { @@ -52389,27 +52389,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1768.0, - "completions/mean_length": 762.546875, - "completions/mean_terminated_length": 723.75048828125, - "completions/min_length": 2.0, - "completions/min_terminated_length": 2.0, + "completions/max_terminated_length": 1758.0, + "completions/mean_length": 784.080078125, + "completions/mean_terminated_length": 774.1279296875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.616881454297175, - "grad_norm": 1.0735392570495605, - "kl": 5.5859375, - "learning_rate": 4.461359173618914e-07, - "loss": 0.3496, - "num_tokens": 998232741.0, - "reward": 1.83740234375, - "reward_std": 0.5108038187026978, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.18874886631965637, + "grad_norm": 8.736144065856934, + "kl": 2.15625, + "learning_rate": 4.463581588581823e-07, + "loss": 0.1598, + "num_tokens": 1033378897.0, + "reward": 1.08154296875, + "reward_std": 0.28288325667381287, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.95263671875, + "rewards/tag_count_reward/std": 0.15188921988010406, "step": 1807 }, { @@ -52418,27 +52418,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 814.939453125, - "completions/mean_terminated_length": 764.8150024414062, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 804.943359375, + "completions/mean_terminated_length": 795.155517578125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.6172228386105658, - "grad_norm": 1.6330175399780273, - "kl": 6.30078125, - "learning_rate": 4.456143586505021e-07, - "loss": 0.3631, - "num_tokens": 998732806.0, - "reward": 1.79150390625, - "reward_std": 0.5776578187942505, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.21192020177841187, + "grad_norm": 4.6284589767456055, + "kl": 2.91796875, + "learning_rate": 4.4583633939769127e-07, + "loss": 0.2051, + "num_tokens": 1033873844.0, + "reward": 1.04638671875, + "reward_std": 0.3237868547439575, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.92529296875, + "rewards/tag_count_reward/std": 0.19132331013679504, "step": 1808 }, { @@ -52447,27 +52447,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1989.0, - "completions/mean_length": 761.005859375, - "completions/mean_terminated_length": 719.4898681640625, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 725.69140625, + "completions/mean_terminated_length": 717.8978881835938, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, "epoch": 0.6175642229239566, - "grad_norm": 1.2343679666519165, - "kl": 6.296875, - "learning_rate": 4.450929480954383e-07, - "loss": 0.3686, - "num_tokens": 999194809.0, - "reward": 1.79248046875, - "reward_std": 0.5539897680282593, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.20248481631278992, + "grad_norm": 3.884291172027588, + "kl": 3.6328125, + "learning_rate": 4.453146678906571e-07, + "loss": 0.2772, + "num_tokens": 1034317766.0, + "reward": 1.01904296875, + "reward_std": 0.24419891834259033, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08812850713729858, + "rewards/tag_count_reward/mean": 0.92919921875, + "rewards/tag_count_reward/std": 0.1856968253850937, "step": 1809 }, { @@ -52476,27 +52476,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1890.0, - "completions/mean_length": 743.8515625, - "completions/mean_terminated_length": 704.4909057617188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1912.0, + "completions/max_terminated_length": 1912.0, + "completions/mean_length": 709.3359375, + "completions/mean_terminated_length": 709.3359375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.6179056072373474, - "grad_norm": 1.5132052898406982, - "kl": 5.5703125, - "learning_rate": 4.445716864367472e-07, - "loss": 0.3359, - "num_tokens": 999653501.0, - "reward": 1.921875, - "reward_std": 0.5488927960395813, - "rewards/accuracy_reward/mean": 0.12109375, - "rewards/accuracy_reward/std": 0.3265552520751953, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.1869896799325943, + "grad_norm": 5.181492805480957, + "kl": 3.8203125, + "learning_rate": 4.4479314507805856e-07, + "loss": 0.2254, + "num_tokens": 1034758786.0, + "reward": 1.11181640625, + "reward_std": 0.32107028365135193, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.93994140625, + "rewards/tag_count_reward/std": 0.1682516187429428, "step": 1810 }, { @@ -52505,27 +52505,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 714.630859375, - "completions/mean_terminated_length": 696.1485595703125, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 1847.0, + "completions/mean_length": 685.404296875, + "completions/mean_terminated_length": 671.9664916992188, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.6182469915507383, - "grad_norm": 2.2754740715026855, - "kl": 4.828125, - "learning_rate": 4.440505744142639e-07, - "loss": 0.2989, - "num_tokens": 1000096352.0, - "reward": 1.88818359375, - "reward_std": 0.4665547311306, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.16480305790901184, + "grad_norm": 6.824659824371338, + "kl": 3.77734375, + "learning_rate": 4.4427177170066387e-07, + "loss": 0.221, + "num_tokens": 1035186673.0, + "reward": 1.01904296875, + "reward_std": 0.20885957777500153, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, + "rewards/tag_count_reward/mean": 0.95068359375, + "rewards/tag_count_reward/std": 0.15287372469902039, "step": 1811 }, { @@ -52534,27 +52534,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1983.0, - "completions/mean_length": 770.412109375, - "completions/mean_terminated_length": 731.8530883789062, - "completions/min_length": 51.0, - "completions/min_terminated_length": 51.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 704.556640625, + "completions/mean_terminated_length": 701.9276123046875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.6185883758641291, - "grad_norm": 1.864904761314392, - "kl": 7.265625, - "learning_rate": 4.4352961276761183e-07, - "loss": 0.4279, - "num_tokens": 1000562099.0, - "reward": 1.8359375, - "reward_std": 0.5089847445487976, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.1835651993751526, + "grad_norm": 4.8679046630859375, + "kl": 2.96875, + "learning_rate": 4.4375054849902847e-07, + "loss": 0.1695, + "num_tokens": 1035618702.0, + "reward": 1.04638671875, + "reward_std": 0.21819032728672028, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, + "rewards/tag_count_reward/mean": 0.95458984375, + "rewards/tag_count_reward/std": 0.1399378776550293, "step": 1812 }, { @@ -52563,27 +52563,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1941.0, - "completions/mean_length": 731.8046875, - "completions/mean_terminated_length": 713.5604248046875, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/max_terminated_length": 1696.0, + "completions/mean_length": 679.759765625, + "completions/mean_terminated_length": 671.6954956054688, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, "epoch": 0.6189297601775199, - "grad_norm": 1.3424431085586548, - "kl": 5.5, - "learning_rate": 4.4300880223620063e-07, - "loss": 0.3374, - "num_tokens": 1001009471.0, - "reward": 1.8642578125, - "reward_std": 0.4546358585357666, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.17161767184734344, + "grad_norm": 2.781749963760376, + "kl": 3.62109375, + "learning_rate": 4.4322947621349517e-07, + "loss": 0.2388, + "num_tokens": 1036039427.0, + "reward": 1.0126953125, + "reward_std": 0.22760999202728271, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.9443359375, + "rewards/tag_count_reward/std": 0.15896911919116974, "step": 1813 }, { @@ -52592,27 +52592,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1961.0, - "completions/mean_length": 764.001953125, - "completions/mean_terminated_length": 733.18603515625, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/max_terminated_length": 1904.0, + "completions/mean_length": 739.45703125, + "completions/mean_terminated_length": 729.153564453125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.6192711444909107, - "grad_norm": 2.8516862392425537, - "kl": 5.8671875, - "learning_rate": 4.424881435592256e-07, - "loss": 0.3957, - "num_tokens": 1001477152.0, - "reward": 1.859375, - "reward_std": 0.47271546721458435, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.1726529598236084, + "grad_norm": 5.976024150848389, + "kl": 4.419921875, + "learning_rate": 4.427085555841915e-07, + "loss": 0.2495, + "num_tokens": 1036494541.0, + "reward": 0.9853515625, + "reward_std": 0.2786972224712372, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.19306157529354095, "step": 1814 }, { @@ -52621,27 +52621,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 850.126953125, - "completions/mean_terminated_length": 801.4329223632812, - "completions/min_length": 216.0, - "completions/min_terminated_length": 216.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 748.955078125, + "completions/mean_terminated_length": 738.7263793945312, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.6196125288043014, - "grad_norm": 1.2924460172653198, - "kl": 6.515625, - "learning_rate": 4.419676374756668e-07, - "loss": 0.4286, - "num_tokens": 1001991457.0, - "reward": 1.88134765625, - "reward_std": 0.5008226633071899, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, + "grad_norm": 3.3548057079315186, + "kl": 4.15234375, + "learning_rate": 4.4218778735103045e-07, + "loss": 0.2734, + "num_tokens": 1036957046.0, + "reward": 1.04345703125, + "reward_std": 0.32334092259407043, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.19549313187599182, + "rewards/tag_count_reward/std": 0.18053071200847626, "step": 1815 }, { @@ -52650,27 +52650,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 789.578125, - "completions/mean_terminated_length": 724.9774169921875, - "completions/min_length": 210.0, - "completions/min_terminated_length": 210.0, + "completions/max_terminated_length": 1751.0, + "completions/mean_length": 701.875, + "completions/mean_terminated_length": 699.24072265625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, "epoch": 0.6199539131176922, - "grad_norm": 1.3212380409240723, - "kl": 7.484375, - "learning_rate": 4.4144728472428725e-07, - "loss": 0.4626, - "num_tokens": 1002470137.0, - "reward": 1.88623046875, - "reward_std": 0.5643240809440613, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.20477943122386932, + "grad_norm": 5.221735000610352, + "kl": 3.61328125, + "learning_rate": 4.416671722537081e-07, + "loss": 0.2002, + "num_tokens": 1037390822.0, + "reward": 1.1005859375, + "reward_std": 0.31636473536491394, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, + "rewards/format_reward/mean": 0.005859375, + "rewards/format_reward/std": 0.07639661431312561, + "rewards/tag_count_reward/mean": 0.9423828125, + "rewards/tag_count_reward/std": 0.16208821535110474, "step": 1816 }, { @@ -52679,27 +52679,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 766.20703125, - "completions/mean_terminated_length": 700.4066162109375, - "completions/min_length": 7.0, - "completions/min_terminated_length": 7.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 706.587890625, + "completions/mean_terminated_length": 693.3590087890625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.620295297431083, - "grad_norm": 1.9985350370407104, - "kl": 8.0390625, - "learning_rate": 4.409270860436325e-07, - "loss": 0.494, - "num_tokens": 1002947075.0, - "reward": 1.79736328125, - "reward_std": 0.5326906442642212, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.21192020177841187, + "grad_norm": 3.937330961227417, + "kl": 4.12890625, + "learning_rate": 4.411467110317031e-07, + "loss": 0.2508, + "num_tokens": 1037837235.0, + "reward": 0.98828125, + "reward_std": 0.24135613441467285, + "rewards/accuracy_reward/mean": 0.03515625, + "rewards/accuracy_reward/std": 0.1843547374010086, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.1684509813785553, "step": 1817 }, { @@ -52708,27 +52708,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 737.626953125, - "completions/mean_terminated_length": 692.624267578125, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 670.060546875, + "completions/mean_terminated_length": 659.2106323242188, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.6206366817444738, - "grad_norm": 2.6582906246185303, - "kl": 7.6328125, - "learning_rate": 4.404070421720293e-07, - "loss": 0.4606, - "num_tokens": 1003409060.0, - "reward": 1.85595703125, - "reward_std": 0.5522315502166748, - "rewards/accuracy_reward/mean": 0.12890625, - "rewards/accuracy_reward/std": 0.33542385697364807, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.20380185544490814, + "grad_norm": 3.414802312850952, + "kl": 3.91796875, + "learning_rate": 4.4062640442427534e-07, + "loss": 0.233, + "num_tokens": 1038264626.0, + "reward": 1.078125, + "reward_std": 0.2918456196784973, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.021484375, + "rewards/format_reward/std": 0.14513419568538666, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.17486365139484406, "step": 1818 }, { @@ -52737,27 +52737,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1700.0, - "completions/mean_length": 777.990234375, - "completions/mean_terminated_length": 750.1057739257812, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 727.689453125, + "completions/mean_terminated_length": 725.1056518554688, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.6209780660578647, - "grad_norm": 1.1062575578689575, - "kl": 4.875, - "learning_rate": 4.398871538475849e-07, - "loss": 0.3062, - "num_tokens": 1003889167.0, - "reward": 1.8896484375, - "reward_std": 0.4880656599998474, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.9482421875, - "rewards/tag_count_reward/std": 0.1610473245382309, + "grad_norm": 3.517559766769409, + "kl": 2.4765625, + "learning_rate": 4.401062531704658e-07, + "loss": 0.1416, + "num_tokens": 1038718979.0, + "reward": 1.05615234375, + "reward_std": 0.25288447737693787, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.95849609375, + "rewards/tag_count_reward/std": 0.1385241001844406, "step": 1819 }, { @@ -52766,27 +52766,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1984.0, - "completions/mean_length": 829.810546875, - "completions/mean_terminated_length": 759.3367309570312, - "completions/min_length": 186.0, - "completions/min_terminated_length": 186.0, + "completions/max_terminated_length": 1812.0, + "completions/mean_length": 742.3515625, + "completions/mean_terminated_length": 734.6561889648438, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, "epoch": 0.6213194503712555, - "grad_norm": 1.053416132926941, - "kl": 7.3125, - "learning_rate": 4.3936742180818565e-07, - "loss": 0.4901, - "num_tokens": 1004390254.0, - "reward": 1.8740234375, - "reward_std": 0.5669934153556824, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.20369693636894226, + "grad_norm": 3.004946231842041, + "kl": 3.15625, + "learning_rate": 4.3958625800909365e-07, + "loss": 0.2226, + "num_tokens": 1039175287.0, + "reward": 1.07666015625, + "reward_std": 0.3101526200771332, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15143637359142303, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.17868997156620026, "step": 1820 }, { @@ -52795,27 +52795,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 806.140625, - "completions/mean_terminated_length": 768.659912109375, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 760.146484375, + "completions/mean_terminated_length": 742.2950439453125, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.6216608346846463, - "grad_norm": 1.6496962308883667, - "kl": 5.89453125, - "learning_rate": 4.3884784679149613e-07, - "loss": 0.3358, - "num_tokens": 1004888854.0, - "reward": 1.81494140625, - "reward_std": 0.5355316996574402, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.2032901644706726, + "grad_norm": 5.341542720794678, + "kl": 2.9765625, + "learning_rate": 4.3906641967875747e-07, + "loss": 0.2179, + "num_tokens": 1039650338.0, + "reward": 1.02197265625, + "reward_std": 0.2661270499229431, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15143637359142303, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.17398646473884583, "step": 1821 }, { @@ -52824,27 +52824,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1969.0, - "completions/mean_length": 825.744140625, - "completions/mean_terminated_length": 778.638916015625, - "completions/min_length": 187.0, - "completions/min_terminated_length": 187.0, + "completions/max_terminated_length": 1868.0, + "completions/mean_length": 781.55078125, + "completions/mean_terminated_length": 779.0723876953125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.6220022189980371, - "grad_norm": 1.900420069694519, - "kl": 6.1875, - "learning_rate": 4.383284295349583e-07, - "loss": 0.3893, - "num_tokens": 1005392083.0, - "reward": 1.81201171875, - "reward_std": 0.5551861524581909, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.1948370635509491, + "grad_norm": 1.6218445301055908, + "kl": 2.03515625, + "learning_rate": 4.3854673891783224e-07, + "loss": 0.1122, + "num_tokens": 1040130940.0, + "reward": 1.0263671875, + "reward_std": 0.24079710245132446, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.9501953125, + "rewards/tag_count_reward/std": 0.15470431745052338, "step": 1822 }, { @@ -52853,27 +52853,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 800.244140625, - "completions/mean_terminated_length": 757.3919677734375, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/max_terminated_length": 1891.0, + "completions/mean_length": 714.837890625, + "completions/mean_terminated_length": 704.340576171875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.6223436033114278, - "grad_norm": 1.5868628025054932, - "kl": 6.7421875, - "learning_rate": 4.378091707757896e-07, - "loss": 0.4093, - "num_tokens": 1005881280.0, - "reward": 1.8447265625, - "reward_std": 0.6214015483856201, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.9091796875, - "rewards/tag_count_reward/std": 0.21457123756408691, + "grad_norm": 6.91715145111084, + "kl": 2.544921875, + "learning_rate": 4.3802721646446985e-07, + "loss": 0.1802, + "num_tokens": 1040576409.0, + "reward": 1.08349609375, + "reward_std": 0.3270333409309387, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.10772226005792618, + "rewards/tag_count_reward/mean": 0.94287109375, + "rewards/tag_count_reward/std": 0.16999182105064392, "step": 1823 }, { @@ -52882,27 +52882,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 834.66796875, - "completions/mean_terminated_length": 767.1217041015625, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1917.0, + "completions/max_terminated_length": 1917.0, + "completions/mean_length": 723.931640625, + "completions/mean_terminated_length": 723.931640625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.6226849876248186, - "grad_norm": 1.5598671436309814, - "kl": 6.4296875, - "learning_rate": 4.372900712509831e-07, - "loss": 0.3905, - "num_tokens": 1006383382.0, - "reward": 1.81005859375, - "reward_std": 0.5211690664291382, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.20882701873779297, + "grad_norm": 2.1370882987976074, + "kl": 1.375, + "learning_rate": 4.375078530565967e-07, + "loss": 0.0735, + "num_tokens": 1041021814.0, + "reward": 1.072265625, + "reward_std": 0.27255967259407043, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.021484375, + "rewards/format_reward/std": 0.14513419568538666, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.13701993227005005, "step": 1824 }, { @@ -52911,27 +52911,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 746.78515625, - "completions/mean_terminated_length": 702.0969848632812, - "completions/min_length": 17.0, - "completions/min_terminated_length": 17.0, + "completions/max_terminated_length": 1839.0, + "completions/mean_length": 686.6875, + "completions/mean_terminated_length": 684.0234985351562, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, "epoch": 0.6230263719382094, - "grad_norm": 1.1952095031738281, - "kl": 6.10546875, - "learning_rate": 4.367711316973054e-07, - "loss": 0.3898, - "num_tokens": 1006836584.0, - "reward": 1.86181640625, - "reward_std": 0.4903917908668518, - "rewards/accuracy_reward/mean": 0.07258064299821854, - "rewards/accuracy_reward/std": 0.25970885157585144, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.18737702071666718, + "grad_norm": 3.462764263153076, + "kl": 1.99609375, + "learning_rate": 4.369886494319137e-07, + "loss": 0.1253, + "num_tokens": 1041444246.0, + "reward": 1.08203125, + "reward_std": 0.27302008867263794, + "rewards/accuracy_reward/mean": 0.11895161122083664, + "rewards/accuracy_reward/std": 0.3240584135055542, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.13493992388248444, "step": 1825 }, { @@ -52940,27 +52940,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 799.009765625, - "completions/mean_terminated_length": 758.7197265625, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 766.158203125, + "completions/mean_terminated_length": 758.6031494140625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.6233677562516002, - "grad_norm": 1.1136211156845093, - "kl": 6.1484375, - "learning_rate": 4.3625235285129634e-07, - "loss": 0.3822, - "num_tokens": 1007322141.0, - "reward": 1.91552734375, - "reward_std": 0.5885144472122192, - "rewards/accuracy_reward/mean": 0.138671875, - "rewards/accuracy_reward/std": 0.34594178199768066, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.19574713706970215, + "grad_norm": 5.409436225891113, + "kl": 2.724609375, + "learning_rate": 4.3646960632789444e-07, + "loss": 0.183, + "num_tokens": 1041912983.0, + "reward": 1.125, + "reward_std": 0.3496522903442383, + "rewards/accuracy_reward/mean": 0.169921875, + "rewards/accuracy_reward/std": 0.3759314715862274, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, + "rewards/tag_count_reward/mean": 0.935546875, + "rewards/tag_count_reward/std": 0.17693878710269928, "step": 1826 }, { @@ -52969,27 +52969,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 823.900390625, - "completions/mean_terminated_length": 758.41357421875, - "completions/min_length": 44.0, - "completions/min_terminated_length": 44.0, + "completions/max_terminated_length": 1775.0, + "completions/mean_length": 751.421875, + "completions/mean_terminated_length": 741.2125854492188, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.623709140564991, - "grad_norm": 1.2269648313522339, - "kl": 7.3359375, - "learning_rate": 4.3573373544926786e-07, - "loss": 0.4647, - "num_tokens": 1007818730.0, - "reward": 1.80322265625, - "reward_std": 0.6270456314086914, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.221117302775383, + "grad_norm": 3.115943193435669, + "kl": 2.564453125, + "learning_rate": 4.3595072448178505e-07, + "loss": 0.1921, + "num_tokens": 1042372463.0, + "reward": 1.0478515625, + "reward_std": 0.27128899097442627, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.009765625, + "rewards/format_reward/std": 0.09843364357948303, + "rewards/tag_count_reward/mean": 0.9404296875, + "rewards/tag_count_reward/std": 0.17378656566143036, "step": 1827 }, { @@ -52998,27 +52998,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 821.3515625, - "completions/mean_terminated_length": 771.48779296875, - "completions/min_length": 125.0, - "completions/min_terminated_length": 125.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 765.251953125, + "completions/mean_terminated_length": 760.2216186523438, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.6240505248783819, - "grad_norm": 1.0753824710845947, - "kl": 7.8203125, - "learning_rate": 4.352152802273024e-07, - "loss": 0.4759, - "num_tokens": 1008320846.0, - "reward": 1.8017578125, - "reward_std": 0.5702003240585327, - "rewards/accuracy_reward/mean": 0.0786290317773819, - "rewards/accuracy_reward/std": 0.26943066716194153, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.9052734375, - "rewards/tag_count_reward/std": 0.21572597324848175, + "grad_norm": 6.244734764099121, + "kl": 2.486328125, + "learning_rate": 4.3543200463060183e-07, + "loss": 0.1403, + "num_tokens": 1042845856.0, + "reward": 1.09375, + "reward_std": 0.25592657923698425, + "rewards/accuracy_reward/mean": 0.11491935700178146, + "rewards/accuracy_reward/std": 0.3192465901374817, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.1467188447713852, "step": 1828 }, { @@ -53027,27 +53027,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 831.482421875, - "completions/mean_terminated_length": 789.7030639648438, - "completions/min_length": 74.0, - "completions/min_terminated_length": 74.0, + "completions/max_terminated_length": 1667.0, + "completions/mean_length": 741.560546875, + "completions/mean_terminated_length": 736.4373168945312, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.6243919091917727, - "grad_norm": 1.256855845451355, - "kl": 5.09375, - "learning_rate": 4.3469698792125196e-07, - "loss": 0.3163, - "num_tokens": 1008822181.0, - "reward": 1.82080078125, - "reward_std": 0.48862963914871216, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.18399609625339508, + "grad_norm": 2.166569232940674, + "kl": 2.05078125, + "learning_rate": 4.349134475111319e-07, + "loss": 0.095, + "num_tokens": 1043301151.0, + "reward": 0.99951171875, + "reward_std": 0.2247733622789383, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.17416280508041382, + "rewards/format_reward/mean": 0.021484375, + "rewards/format_reward/std": 0.14513419568538666, + "rewards/tag_count_reward/mean": 0.94677734375, + "rewards/tag_count_reward/std": 0.16170679032802582, "step": 1829 }, { @@ -53056,27 +53056,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 788.90625, - "completions/mean_terminated_length": 758.6880493164062, - "completions/min_length": 106.0, - "completions/min_terminated_length": 106.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 725.310546875, + "completions/mean_terminated_length": 722.7221069335938, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.6247332935051635, - "grad_norm": 2.1441006660461426, - "kl": 6.0390625, - "learning_rate": 4.341788592667381e-07, - "loss": 0.3577, - "num_tokens": 1009302837.0, - "reward": 1.83056640625, - "reward_std": 0.5533666610717773, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.19362683594226837, + "grad_norm": 3.723933458328247, + "kl": 2.40234375, + "learning_rate": 4.343950538599305e-07, + "loss": 0.1616, + "num_tokens": 1043749246.0, + "reward": 1.09619140625, + "reward_std": 0.2847236394882202, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15143637359142303, + "rewards/tag_count_reward/mean": 0.95556640625, + "rewards/tag_count_reward/std": 0.14789213240146637, "step": 1830 }, { @@ -53085,27 +53085,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1890.0, - "completions/mean_length": 750.34765625, - "completions/mean_terminated_length": 716.5410766601562, - "completions/min_length": 192.0, - "completions/min_terminated_length": 192.0, + "completions/max_terminated_length": 1678.0, + "completions/mean_length": 713.478515625, + "completions/mean_terminated_length": 708.2451171875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.6250746778185542, - "grad_norm": 0.9880239367485046, - "kl": 4.8359375, - "learning_rate": 4.336608949991496e-07, - "loss": 0.2965, - "num_tokens": 1009770151.0, - "reward": 1.9775390625, - "reward_std": 0.5535204410552979, - "rewards/accuracy_reward/mean": 0.16796875, - "rewards/accuracy_reward/std": 0.374204158782959, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.17934982478618622, + "grad_norm": 2.8965001106262207, + "kl": 2.599609375, + "learning_rate": 4.338768244133212e-07, + "loss": 0.165, + "num_tokens": 1044197683.0, + "reward": 1.14453125, + "reward_std": 0.31257838010787964, + "rewards/accuracy_reward/mean": 0.181640625, + "rewards/accuracy_reward/std": 0.38592514395713806, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.16385014355182648, "step": 1831 }, { @@ -53114,27 +53114,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 854.609375, - "completions/mean_terminated_length": 793.3470458984375, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 789.892578125, + "completions/mean_terminated_length": 779.9862060546875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, "epoch": 0.625416062131945, - "grad_norm": 1.020838737487793, - "kl": 6.22265625, - "learning_rate": 4.3314309585364185e-07, - "loss": 0.3637, - "num_tokens": 1010290687.0, - "reward": 1.85107421875, - "reward_std": 0.5193954706192017, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.20463472604751587, + "grad_norm": 4.079098224639893, + "kl": 3.57421875, + "learning_rate": 4.333587599073937e-07, + "loss": 0.2137, + "num_tokens": 1044685084.0, + "reward": 1.04931640625, + "reward_std": 0.28461208939552307, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.93408203125, + "rewards/tag_count_reward/std": 0.1788075864315033, "step": 1832 }, { @@ -53143,27 +53143,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 770.44140625, - "completions/mean_terminated_length": 755.2925415039062, - "completions/min_length": 190.0, - "completions/min_terminated_length": 190.0, + "completions/max_terminated_length": 1891.0, + "completions/mean_length": 731.41015625, + "completions/mean_terminated_length": 723.6503295898438, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.6257574464453358, - "grad_norm": 1.6976969242095947, - "kl": 4.18359375, - "learning_rate": 4.3262546256513613e-07, - "loss": 0.2874, - "num_tokens": 1010764705.0, - "reward": 1.9326171875, - "reward_std": 0.48232823610305786, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.9482421875, - "rewards/tag_count_reward/std": 0.16330981254577637, + "grad_norm": 4.378573417663574, + "kl": 4.1640625, + "learning_rate": 4.328408610780041e-07, + "loss": 0.2409, + "num_tokens": 1045139118.0, + "reward": 1.0654296875, + "reward_std": 0.3101952075958252, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.1889999359846115, "step": 1833 }, { @@ -53172,27 +53172,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 862.01171875, - "completions/mean_terminated_length": 816.3042602539062, - "completions/min_length": 134.0, - "completions/min_terminated_length": 134.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 772.3671875, + "completions/mean_terminated_length": 759.7869873046875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, "epoch": 0.6260988307587266, - "grad_norm": 1.7305068969726562, - "kl": 5.62890625, - "learning_rate": 4.3210799586831825e-07, - "loss": 0.3433, - "num_tokens": 1011283879.0, - "reward": 1.79931640625, - "reward_std": 0.544184148311615, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.1946849673986435, + "grad_norm": 3.3213179111480713, + "kl": 3.4296875, + "learning_rate": 4.323231286607727e-07, + "loss": 0.2083, + "num_tokens": 1045612394.0, + "reward": 1.0009765625, + "reward_std": 0.2565445303916931, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.17864517867565155, "step": 1834 }, { @@ -53201,27 +53201,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1917.0, - "completions/mean_length": 850.76953125, - "completions/mean_terminated_length": 797.0162963867188, - "completions/min_length": 174.0, - "completions/min_terminated_length": 174.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 789.412109375, + "completions/mean_terminated_length": 779.501953125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.6264402150721174, - "grad_norm": 1.5599923133850098, - "kl": 6.9375, - "learning_rate": 4.3159069649763747e-07, - "loss": 0.4452, - "num_tokens": 1011796913.0, - "reward": 1.828125, - "reward_std": 0.5796926021575928, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.2087314873933792, + "grad_norm": 6.338538646697998, + "kl": 4.953125, + "learning_rate": 4.3180556339108385e-07, + "loss": 0.3012, + "num_tokens": 1046094013.0, + "reward": 1.0537109375, + "reward_std": 0.3417166769504547, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.19093148410320282, "step": 1835 }, { @@ -53230,27 +53230,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 829.044921875, - "completions/mean_terminated_length": 799.7900390625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1988.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 743.64453125, + "completions/mean_terminated_length": 743.64453125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.6267815993855083, - "grad_norm": 0.9597150683403015, - "kl": 6.125, - "learning_rate": 4.3107356518730564e-07, - "loss": 0.3887, - "num_tokens": 1012307032.0, - "reward": 1.84912109375, - "reward_std": 0.528598427772522, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.1749831587076187, + "grad_norm": 4.289685249328613, + "kl": 2.587890625, + "learning_rate": 4.3128816600408393e-07, + "loss": 0.1294, + "num_tokens": 1046560407.0, + "reward": 1.05224609375, + "reward_std": 0.24506792426109314, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.17416280508041382, + "rewards/tag_count_reward/mean": 0.95654296875, + "rewards/tag_count_reward/std": 0.1456853747367859, "step": 1836 }, { @@ -53259,27 +53259,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1897.0, - "completions/mean_length": 825.412109375, - "completions/mean_terminated_length": 780.8643798828125, - "completions/min_length": 75.0, - "completions/min_terminated_length": 75.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 772.03515625, + "completions/mean_terminated_length": 769.5381469726562, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, "epoch": 0.6271229836988991, - "grad_norm": 1.6486892700195312, - "kl": 6.0703125, - "learning_rate": 4.30556602671296e-07, - "loss": 0.3711, - "num_tokens": 1012802251.0, - "reward": 1.85693359375, - "reward_std": 0.5106162428855896, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.19079817831516266, + "grad_norm": 3.7972774505615234, + "kl": 3.54296875, + "learning_rate": 4.307709372346816e-07, + "loss": 0.1951, + "num_tokens": 1047028297.0, + "reward": 1.01904296875, + "reward_std": 0.30611443519592285, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15143637359142303, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.19357748329639435, "step": 1837 }, { @@ -53288,27 +53288,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 832.314453125, - "completions/mean_terminated_length": 772.526611328125, - "completions/min_length": 82.0, - "completions/min_terminated_length": 82.0, + "completions/max_terminated_length": 1657.0, + "completions/mean_length": 758.759765625, + "completions/mean_terminated_length": 748.6082763671875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.6274643680122899, - "grad_norm": 2.0939388275146484, - "kl": 7.7578125, - "learning_rate": 4.300398096833424e-07, - "loss": 0.4827, - "num_tokens": 1013303324.0, - "reward": 1.84375, - "reward_std": 0.5772364735603333, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.20931662619113922, + "grad_norm": 2.0366618633270264, + "kl": 3.2109375, + "learning_rate": 4.302538778175452e-07, + "loss": 0.2163, + "num_tokens": 1047491710.0, + "reward": 1.0625, + "reward_std": 0.28369539976119995, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.17416280508041382, + "rewards/tag_count_reward/mean": 0.947265625, + "rewards/tag_count_reward/std": 0.1607295721769333, "step": 1838 }, { @@ -53317,27 +53317,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 775.90234375, - "completions/mean_terminated_length": 745.3720092773438, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1965.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 751.966796875, + "completions/mean_terminated_length": 751.966796875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, "epoch": 0.6278057523256806, - "grad_norm": 2.218848943710327, - "kl": 6.140625, - "learning_rate": 4.2952318695693803e-07, - "loss": 0.3746, - "num_tokens": 1013777194.0, - "reward": 1.84228515625, - "reward_std": 0.561437726020813, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.1902967095375061, + "grad_norm": 6.083302974700928, + "kl": 1.662109375, + "learning_rate": 4.2973698848710293e-07, + "loss": 0.0744, + "num_tokens": 1047953325.0, + "reward": 1.1044921875, + "reward_std": 0.272316575050354, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, + "rewards/tag_count_reward/mean": 0.9638671875, + "rewards/tag_count_reward/std": 0.1286429464817047, "step": 1839 }, { @@ -53346,27 +53346,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 789.443359375, - "completions/mean_terminated_length": 738.282470703125, - "completions/min_length": 80.0, - "completions/min_terminated_length": 80.0, + "completions/max_terminated_length": 1805.0, + "completions/mean_length": 733.283203125, + "completions/mean_terminated_length": 725.534423828125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.6281471366390714, - "grad_norm": 4.587276935577393, - "kl": 8.5234375, - "learning_rate": 4.290067352253344e-07, - "loss": 0.4924, - "num_tokens": 1014250861.0, - "reward": 1.7724609375, - "reward_std": 0.5534372329711914, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, - "rewards/tag_count_reward/mean": 0.9091796875, - "rewards/tag_count_reward/std": 0.20996157824993134, + "grad_norm": 4.441151142120361, + "kl": 2.404296875, + "learning_rate": 4.2922026997754156e-07, + "loss": 0.1194, + "num_tokens": 1048398238.0, + "reward": 1.0546875, + "reward_std": 0.2719019949436188, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15143637359142303, + "rewards/tag_count_reward/mean": 0.943359375, + "rewards/tag_count_reward/std": 0.16393177211284637, "step": 1840 }, { @@ -53375,27 +53375,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 776.703125, - "completions/mean_terminated_length": 740.9638061523438, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 756.84765625, + "completions/mean_terminated_length": 754.3209228515625, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, "epoch": 0.6284885209524622, - "grad_norm": 3.668210029602051, - "kl": 7.21875, - "learning_rate": 4.2849045522153994e-07, - "loss": 0.4118, - "num_tokens": 1014729573.0, - "reward": 1.7890625, - "reward_std": 0.5655283331871033, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.8125, - "rewards/format_reward/std": 0.39069411158561707, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.1907537281513214, + "grad_norm": 3.5273263454437256, + "kl": 2.5546875, + "learning_rate": 4.28703723022805e-07, + "loss": 0.1617, + "num_tokens": 1048866784.0, + "reward": 1.0771484375, + "reward_std": 0.3079281449317932, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, + "rewards/tag_count_reward/mean": 0.9404296875, + "rewards/tag_count_reward/std": 0.16951124370098114, "step": 1841 }, { @@ -53404,27 +53404,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 837.736328125, - "completions/mean_terminated_length": 788.53857421875, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 826.09375, + "completions/mean_terminated_length": 823.7025146484375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, "epoch": 0.628829905265853, - "grad_norm": 5.375893592834473, - "kl": 8.25, - "learning_rate": 4.2797434767831985e-07, - "loss": 0.453, - "num_tokens": 1015235342.0, - "reward": 1.7314453125, - "reward_std": 0.6241041421890259, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.76953125, - "rewards/format_reward/std": 0.42154473066329956, - "rewards/tag_count_reward/mean": 0.8935546875, - "rewards/tag_count_reward/std": 0.21421466767787933, + "grad_norm": 5.309321403503418, + "kl": 2.126953125, + "learning_rate": 4.2818734835659355e-07, + "loss": 0.1128, + "num_tokens": 1049366592.0, + "reward": 1.076171875, + "reward_std": 0.31113630533218384, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.18369008600711823, "step": 1842 }, { @@ -53433,27 +53433,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 783.0625, - "completions/mean_terminated_length": 736.9716796875, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1779.0, + "completions/max_terminated_length": 1779.0, + "completions/mean_length": 787.052734375, + "completions/mean_terminated_length": 787.052734375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, "epoch": 0.6291712895792438, - "grad_norm": 1.8987025022506714, - "kl": 6.8359375, - "learning_rate": 4.274584133281944e-07, - "loss": 0.418, - "num_tokens": 1015703998.0, - "reward": 1.8291015625, - "reward_std": 0.5494622588157654, - "rewards/accuracy_reward/mean": 0.05645161122083664, - "rewards/accuracy_reward/std": 0.23102475702762604, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.18873685598373413, + "grad_norm": 2.3830864429473877, + "kl": 1.587890625, + "learning_rate": 4.276711467123628e-07, + "loss": 0.0955, + "num_tokens": 1049837291.0, + "reward": 1.08837890625, + "reward_std": 0.2593773305416107, + "rewards/accuracy_reward/mean": 0.0927419364452362, + "rewards/accuracy_reward/std": 0.2903633117675781, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.96337890625, + "rewards/tag_count_reward/std": 0.13179369270801544, "step": 1843 }, { @@ -53462,27 +53462,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1935.0, - "completions/mean_length": 771.59375, - "completions/mean_terminated_length": 719.707275390625, - "completions/min_length": 60.0, - "completions/min_terminated_length": 60.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 718.642578125, + "completions/mean_terminated_length": 716.0410766601562, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.6295126738926347, - "grad_norm": 1.2763030529022217, - "kl": 6.5234375, - "learning_rate": 4.269426529034382e-07, - "loss": 0.4324, - "num_tokens": 1016175038.0, - "reward": 1.85546875, - "reward_std": 0.5567238330841064, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.19597215950489044, + "grad_norm": 3.440014123916626, + "kl": 2.0625, + "learning_rate": 4.271551188233224e-07, + "loss": 0.0945, + "num_tokens": 1050281220.0, + "reward": 1.1240234375, + "reward_std": 0.284148246049881, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.9638671875, + "rewards/tag_count_reward/std": 0.13146430253982544, "step": 1844 }, { @@ -53491,27 +53491,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1772.0, - "completions/mean_length": 727.19921875, - "completions/mean_terminated_length": 700.8884887695312, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 1828.0, + "completions/mean_length": 733.169921875, + "completions/mean_terminated_length": 730.5968627929688, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, "epoch": 0.6298540582060255, - "grad_norm": 3.3444409370422363, - "kl": 4.765625, - "learning_rate": 4.264270671360787e-07, - "loss": 0.2975, - "num_tokens": 1016619300.0, - "reward": 1.84521484375, - "reward_std": 0.5264946818351746, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.1884550154209137, + "grad_norm": 6.286828994750977, + "kl": 1.501953125, + "learning_rate": 4.2663926542243555e-07, + "loss": 0.098, + "num_tokens": 1050728539.0, + "reward": 1.07568359375, + "reward_std": 0.2769525647163391, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15143637359142303, + "rewards/tag_count_reward/mean": 0.94677734375, + "rewards/tag_count_reward/std": 0.16691738367080688, "step": 1845 }, { @@ -53520,27 +53520,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1951.0, - "completions/mean_length": 775.34375, - "completions/mean_terminated_length": 739.5662231445312, - "completions/min_length": 101.0, - "completions/min_terminated_length": 101.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 801.595703125, + "completions/mean_terminated_length": 799.1565551757812, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.6301954425194163, - "grad_norm": 1.9668244123458862, - "kl": 4.640625, - "learning_rate": 4.2591165675789555e-07, - "loss": 0.3106, - "num_tokens": 1017097236.0, - "reward": 1.91455078125, - "reward_std": 0.5258707404136658, - "rewards/accuracy_reward/mean": 0.11290322244167328, - "rewards/accuracy_reward/std": 0.3167939782142639, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.17468255758285522, + "grad_norm": 3.196523666381836, + "kl": 2.423828125, + "learning_rate": 4.261235872424173e-07, + "loss": 0.1337, + "num_tokens": 1051219916.0, + "reward": 1.1005859375, + "reward_std": 0.31296324729919434, + "rewards/accuracy_reward/mean": 0.1270161271095276, + "rewards/accuracy_reward/std": 0.3333272337913513, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.9482421875, + "rewards/tag_count_reward/std": 0.1548524796962738, "step": 1846 }, { @@ -53549,27 +53549,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1959.0, - "completions/mean_length": 769.189453125, - "completions/mean_terminated_length": 748.8909301757812, - "completions/min_length": 213.0, - "completions/min_terminated_length": 213.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 754.048828125, + "completions/mean_terminated_length": 748.9745483398438, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.630536826832807, - "grad_norm": 2.781959295272827, - "kl": 4.33203125, - "learning_rate": 4.2539642250041973e-07, - "loss": 0.2909, - "num_tokens": 1017562949.0, - "reward": 1.9033203125, - "reward_std": 0.4896920323371887, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.89453125, - "rewards/format_reward/std": 0.3074568510055542, - "rewards/tag_count_reward/mean": 0.9462890625, - "rewards/tag_count_reward/std": 0.16342678666114807, + "grad_norm": 2.4521005153656006, + "kl": 1.791015625, + "learning_rate": 4.2560808501573395e-07, + "loss": 0.0724, + "num_tokens": 1051677877.0, + "reward": 1.076171875, + "reward_std": 0.29641374945640564, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.021484375, + "rewards/format_reward/std": 0.14513419568538666, + "rewards/tag_count_reward/mean": 0.955078125, + "rewards/tag_count_reward/std": 0.14979958534240723, "step": 1847 }, { @@ -53578,27 +53578,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1936.0, - "completions/mean_length": 783.32421875, - "completions/mean_terminated_length": 734.5841674804688, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1663.0, + "completions/max_terminated_length": 1663.0, + "completions/mean_length": 738.46875, + "completions/mean_terminated_length": 738.46875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.6308782111461978, - "grad_norm": 2.7658309936523438, - "kl": 5.9921875, - "learning_rate": 4.2488136509493165e-07, - "loss": 0.4025, - "num_tokens": 1018037451.0, - "reward": 1.82275390625, - "reward_std": 0.5239130258560181, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.20123985409736633, + "grad_norm": 1.9718989133834839, + "kl": 1.6494140625, + "learning_rate": 4.250927594746018e-07, + "loss": 0.0816, + "num_tokens": 1052129413.0, + "reward": 1.04443359375, + "reward_std": 0.28246134519577026, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, + "rewards/tag_count_reward/mean": 0.95068359375, + "rewards/tag_count_reward/std": 0.15446557104587555, "step": 1848 }, { @@ -53607,27 +53607,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 759.548828125, - "completions/mean_terminated_length": 715.2990112304688, - "completions/min_length": 169.0, - "completions/min_terminated_length": 169.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 808.255859375, + "completions/mean_terminated_length": 793.5553588867188, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, "epoch": 0.6312195954595886, - "grad_norm": 2.1002485752105713, - "kl": 5.515625, - "learning_rate": 4.243664852724614e-07, - "loss": 0.3747, - "num_tokens": 1018505444.0, - "reward": 1.8642578125, - "reward_std": 0.5158007144927979, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.9345703125, - "rewards/tag_count_reward/std": 0.17795921862125397, + "grad_norm": 5.427484512329102, + "kl": 2.71484375, + "learning_rate": 4.2457761135098644e-07, + "loss": 0.1676, + "num_tokens": 1052622344.0, + "reward": 1.041015625, + "reward_std": 0.31342387199401855, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.189265176653862, "step": 1849 }, { @@ -53636,27 +53636,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 802.892578125, - "completions/mean_terminated_length": 746.98974609375, - "completions/min_length": 60.0, - "completions/min_terminated_length": 60.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1991.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 756.81640625, + "completions/mean_terminated_length": 756.81640625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.6315609797729794, - "grad_norm": 1.6585819721221924, - "kl": 6.41015625, - "learning_rate": 4.238517837637866e-07, - "loss": 0.3989, - "num_tokens": 1019012141.0, - "reward": 1.87451171875, - "reward_std": 0.501410961151123, + "grad_norm": 2.435065984725952, + "kl": 1.845703125, + "learning_rate": 4.2406264137660106e-07, + "loss": 0.0788, + "num_tokens": 1053105450.0, + "reward": 1.06103515625, + "reward_std": 0.2741588056087494, "rewards/accuracy_reward/mean": 0.08064515888690948, "rewards/accuracy_reward/std": 0.2725643217563629, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.18960754573345184, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.94970703125, + "rewards/tag_count_reward/std": 0.15806719660758972, "step": 1850 }, { @@ -53665,27 +53665,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1970.0, - "completions/mean_length": 792.677734375, - "completions/mean_terminated_length": 752.1834716796875, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 766.275390625, + "completions/mean_terminated_length": 758.7210693359375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, "epoch": 0.6319023640863702, - "grad_norm": 1.6555026769638062, - "kl": 6.640625, - "learning_rate": 4.233372612994319e-07, - "loss": 0.4092, - "num_tokens": 1019495976.0, - "reward": 1.85302734375, - "reward_std": 0.5476795434951782, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.20228178799152374, + "grad_norm": 2.4755587577819824, + "kl": 2.18359375, + "learning_rate": 4.235478502829062e-07, + "loss": 0.1645, + "num_tokens": 1053575767.0, + "reward": 1.1064453125, + "reward_std": 0.34935909509658813, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.021484375, + "rewards/format_reward/std": 0.14513419568538666, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.17521031200885773, "step": 1851 }, { @@ -53694,27 +53694,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 859.576171875, - "completions/mean_terminated_length": 816.2733154296875, - "completions/min_length": 55.0, - "completions/min_terminated_length": 55.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 822.345703125, + "completions/mean_terminated_length": 815.121826171875, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, "epoch": 0.632243748399761, - "grad_norm": 1.5367271900177002, - "kl": 6.8203125, - "learning_rate": 4.228229186096679e-07, - "loss": 0.3944, - "num_tokens": 1020011855.0, - "reward": 1.79443359375, - "reward_std": 0.5380985140800476, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.21844784915447235, + "grad_norm": 4.414604187011719, + "kl": 2.36328125, + "learning_rate": 4.23033238801108e-07, + "loss": 0.1116, + "num_tokens": 1054072584.0, + "reward": 1.08203125, + "reward_std": 0.2705024480819702, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.951171875, + "rewards/tag_count_reward/std": 0.15182708203792572, "step": 1852 }, { @@ -53723,27 +53723,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1990.0, - "completions/mean_length": 805.333984375, - "completions/mean_terminated_length": 752.1853637695312, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 1836.0, + "completions/mean_length": 774.818359375, + "completions/mean_terminated_length": 767.3143920898438, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, "epoch": 0.6325851327131519, - "grad_norm": 1.0009610652923584, - "kl": 6.8828125, - "learning_rate": 4.223087564245099e-07, - "loss": 0.4441, - "num_tokens": 1020514202.0, - "reward": 1.79931640625, - "reward_std": 0.5320873856544495, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.20425614714622498, + "grad_norm": 2.869246006011963, + "kl": 3.03125, + "learning_rate": 4.2251880766215764e-07, + "loss": 0.2154, + "num_tokens": 1054559307.0, + "reward": 1.005859375, + "reward_std": 0.2485184371471405, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.1698969453573227, "step": 1853 }, { @@ -53752,27 +53752,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 892.333984375, - "completions/mean_terminated_length": 850.2247314453125, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 834.44140625, + "completions/mean_terminated_length": 827.288818359375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, "epoch": 0.6329265170265427, - "grad_norm": 1.004145622253418, - "kl": 5.7109375, - "learning_rate": 4.2179477547371713e-07, - "loss": 0.3546, - "num_tokens": 1021045461.0, - "reward": 1.82275390625, - "reward_std": 0.5507851839065552, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.19940820336341858, + "grad_norm": 2.5125839710235596, + "kl": 2.43359375, + "learning_rate": 4.220045575967499e-07, + "loss": 0.1682, + "num_tokens": 1055060925.0, + "reward": 1.0732421875, + "reward_std": 0.2804592251777649, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, + "rewards/tag_count_reward/mean": 0.9501953125, + "rewards/tag_count_reward/std": 0.15069948136806488, "step": 1854 }, { @@ -53781,27 +53781,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 768.025390625, - "completions/mean_terminated_length": 734.6793823242188, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/max_terminated_length": 1717.0, + "completions/mean_length": 730.72265625, + "completions/mean_terminated_length": 725.556884765625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.6332679013399334, - "grad_norm": 0.9820337295532227, - "kl": 6.5859375, - "learning_rate": 4.212809764867915e-07, - "loss": 0.4173, - "num_tokens": 1021515906.0, - "reward": 1.83056640625, - "reward_std": 0.5448594689369202, + "grad_norm": 2.4686412811279297, + "kl": 3.24609375, + "learning_rate": 4.2149048933532306e-07, + "loss": 0.2033, + "num_tokens": 1055512271.0, + "reward": 1.0146484375, + "reward_std": 0.24637790024280548, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.19879388809204102, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.17660093307495117, "step": 1855 }, { @@ -53810,27 +53810,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 792.013671875, - "completions/mean_terminated_length": 746.2490234375, - "completions/min_length": 176.0, - "completions/min_terminated_length": 176.0, + "completions/max_terminated_length": 1752.0, + "completions/mean_length": 752.13671875, + "completions/mean_terminated_length": 747.054931640625, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, "epoch": 0.6336092856533242, - "grad_norm": 1.5900251865386963, - "kl": 7.6484375, - "learning_rate": 4.2076736019297674e-07, - "loss": 0.4744, - "num_tokens": 1021996073.0, - "reward": 1.8427734375, - "reward_std": 0.5668472647666931, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.19635941088199615, + "grad_norm": 6.214570999145508, + "kl": 4.00390625, + "learning_rate": 4.209766036080562e-07, + "loss": 0.2067, + "num_tokens": 1055972021.0, + "reward": 1.037109375, + "reward_std": 0.30608439445495605, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.021484375, + "rewards/format_reward/std": 0.14513419568538666, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.18652920424938202, "step": 1856 }, { @@ -53839,27 +53839,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 750.109375, - "completions/mean_terminated_length": 697.3495483398438, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 722.705078125, + "completions/mean_terminated_length": 714.8939208984375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, "epoch": 0.633950669966715, - "grad_norm": 1.420174479484558, - "kl": 5.787109375, - "learning_rate": 4.202539273212572e-07, - "loss": 0.3654, - "num_tokens": 1022461153.0, - "reward": 1.83740234375, - "reward_std": 0.5509225130081177, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.2005549520254135, + "grad_norm": 11.045815467834473, + "kl": 4.73828125, + "learning_rate": 4.2046290114486993e-07, + "loss": 0.2426, + "num_tokens": 1056423070.0, + "reward": 1.052734375, + "reward_std": 0.3256292939186096, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.1910640448331833, "step": 1857 }, { @@ -53868,27 +53868,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 773.87890625, - "completions/mean_terminated_length": 722.0853271484375, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 769.69921875, + "completions/mean_terminated_length": 751.980224609375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, "epoch": 0.6342920542801058, - "grad_norm": 1.0452171564102173, - "kl": 6.46875, - "learning_rate": 4.197406786003569e-07, - "loss": 0.4259, - "num_tokens": 1022933731.0, - "reward": 1.83984375, - "reward_std": 0.5241327881813049, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.2059757262468338, + "grad_norm": 3.6696245670318604, + "kl": 4.578125, + "learning_rate": 4.19949382675424e-07, + "loss": 0.2662, + "num_tokens": 1056893508.0, + "reward": 1.0439453125, + "reward_std": 0.30018579959869385, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.17913658916950226, "step": 1858 }, { @@ -53897,27 +53897,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 769.501953125, - "completions/mean_terminated_length": 728.2600708007812, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 1900.0, + "completions/mean_length": 669.791015625, + "completions/mean_terminated_length": 658.93896484375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.6346334385934966, - "grad_norm": 2.23154878616333, - "kl": 6.8671875, - "learning_rate": 4.192276147587387e-07, - "loss": 0.4102, - "num_tokens": 1023403076.0, - "reward": 1.86181640625, - "reward_std": 0.5138826370239258, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.1953708976507187, + "grad_norm": 2.9222052097320557, + "kl": 3.73828125, + "learning_rate": 4.1943604892911744e-07, + "loss": 0.2572, + "num_tokens": 1057311801.0, + "reward": 1.06982421875, + "reward_std": 0.2826596200466156, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, + "rewards/tag_count_reward/mean": 0.94482421875, + "rewards/tag_count_reward/std": 0.16028828918933868, "step": 1859 }, { @@ -53926,27 +53926,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1982.0, - "completions/mean_length": 810.333984375, - "completions/mean_terminated_length": 762.6348876953125, - "completions/min_length": 195.0, - "completions/min_terminated_length": 195.0, + "completions/max_terminated_length": 1731.0, + "completions/mean_length": 764.890625, + "completions/mean_terminated_length": 754.7874145507812, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, "epoch": 0.6349748229068874, - "grad_norm": 0.7847080230712891, - "kl": 7.265625, - "learning_rate": 4.1871473652460265e-07, - "loss": 0.465, - "num_tokens": 1023899343.0, - "reward": 1.83837890625, - "reward_std": 0.5229803323745728, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.19297432899475098, + "grad_norm": 9.997632026672363, + "kl": 3.25390625, + "learning_rate": 4.1892290063508596e-07, + "loss": 0.1433, + "num_tokens": 1057784801.0, + "reward": 1.09033203125, + "reward_std": 0.3576526641845703, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.93994140625, + "rewards/tag_count_reward/std": 0.16897699236869812, "step": 1860 }, { @@ -53955,27 +53955,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1950.0, - "completions/mean_length": 822.53515625, - "completions/mean_terminated_length": 772.719482421875, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2043.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 763.3046875, + "completions/mean_terminated_length": 763.3046875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, "epoch": 0.6353162072202783, - "grad_norm": 0.9601285457611084, - "kl": 7.3125, - "learning_rate": 4.182020446258857e-07, - "loss": 0.4548, - "num_tokens": 1024408337.0, - "reward": 1.798828125, - "reward_std": 0.527459979057312, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.19534705579280853, + "grad_norm": 5.027781963348389, + "kl": 2.533203125, + "learning_rate": 4.1840993852220284e-07, + "loss": 0.1987, + "num_tokens": 1058263469.0, + "reward": 1.02197265625, + "reward_std": 0.19869840145111084, + "rewards/accuracy_reward/mean": 0.044921875, + "rewards/accuracy_reward/std": 0.20733514428138733, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.96142578125, + "rewards/tag_count_reward/std": 0.12648846209049225, "step": 1861 }, { @@ -53984,27 +53984,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1833.0, - "completions/mean_length": 821.248046875, - "completions/mean_terminated_length": 784.2233276367188, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 1910.0, + "completions/mean_length": 787.03125, + "completions/mean_terminated_length": 784.5635986328125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.6356575915336691, - "grad_norm": 1.136073350906372, - "kl": 7.125, - "learning_rate": 4.1768953979026024e-07, - "loss": 0.3985, - "num_tokens": 1024903952.0, - "reward": 1.841796875, - "reward_std": 0.5617328882217407, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.19183269143104553, + "grad_norm": 1.810512661933899, + "kl": 2.578125, + "learning_rate": 4.178971633190762e-07, + "loss": 0.151, + "num_tokens": 1058741565.0, + "reward": 1.07666015625, + "reward_std": 0.2865458130836487, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15143637359142303, + "rewards/tag_count_reward/mean": 0.94189453125, + "rewards/tag_count_reward/std": 0.16077639162540436, "step": 1862 }, { @@ -54013,27 +54013,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 766.908203125, - "completions/mean_terminated_length": 736.1620483398438, - "completions/min_length": 70.0, - "completions/min_terminated_length": 70.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 710.56640625, + "completions/mean_terminated_length": 705.3215942382812, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.6359989758470598, - "grad_norm": 1.2205417156219482, - "kl": 6.1796875, - "learning_rate": 4.171772227451331e-07, - "loss": 0.3771, - "num_tokens": 1025373601.0, - "reward": 1.90625, - "reward_std": 0.5341774225234985, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.1933014690876007, + "grad_norm": 3.8119871616363525, + "kl": 3.1953125, + "learning_rate": 4.173845757540493e-07, + "loss": 0.1931, + "num_tokens": 1059182367.0, + "reward": 1.1201171875, + "reward_std": 0.35426056385040283, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.1702537089586258, "step": 1863 }, { @@ -54042,27 +54042,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1950.0, - "completions/mean_length": 775.53515625, - "completions/mean_terminated_length": 726.4949340820312, - "completions/min_length": 99.0, - "completions/min_terminated_length": 99.0, + "completions/max_terminated_length": 1772.0, + "completions/mean_length": 781.9453125, + "completions/mean_terminated_length": 779.4677124023438, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, "epoch": 0.6363403601604506, - "grad_norm": 1.4155833721160889, - "kl": 5.9140625, - "learning_rate": 4.166650942176447e-07, - "loss": 0.3838, - "num_tokens": 1025847987.0, - "reward": 1.8427734375, - "reward_std": 0.5575990676879883, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.2015654444694519, + "grad_norm": 3.6843318939208984, + "kl": 2.814453125, + "learning_rate": 4.1687217655519813e-07, + "loss": 0.1615, + "num_tokens": 1059660035.0, + "reward": 1.12939453125, + "reward_std": 0.357856810092926, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.94189453125, + "rewards/tag_count_reward/std": 0.1615353375673294, "step": 1864 }, { @@ -54071,27 +54071,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 809.529296875, - "completions/mean_terminated_length": 745.9528198242188, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 760.0703125, + "completions/mean_terminated_length": 752.4793701171875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, "epoch": 0.6366817444738414, - "grad_norm": 0.8453729152679443, - "kl": 7.4453125, - "learning_rate": 4.1615315493466797e-07, - "loss": 0.5078, - "num_tokens": 1026348754.0, - "reward": 1.82275390625, - "reward_std": 0.5655902028083801, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.20901454985141754, + "grad_norm": 5.010125160217285, + "kl": 2.537109375, + "learning_rate": 4.163599664503319e-07, + "loss": 0.1691, + "num_tokens": 1060135479.0, + "reward": 1.0888671875, + "reward_std": 0.3207041621208191, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, + "rewards/tag_count_reward/mean": 0.9404296875, + "rewards/tag_count_reward/std": 0.1765792965888977, "step": 1865 }, { @@ -54100,27 +54100,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1835.0, - "completions/mean_length": 736.716796875, - "completions/mean_terminated_length": 694.4172973632812, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 716.484375, + "completions/mean_terminated_length": 713.878662109375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.6370231287872322, - "grad_norm": 0.8038697242736816, - "kl": 7.4375, - "learning_rate": 4.156414056228065e-07, - "loss": 0.4755, - "num_tokens": 1026800657.0, - "reward": 1.83056640625, - "reward_std": 0.5810835957527161, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.1923394650220871, + "grad_norm": 2.8067944049835205, + "kl": 2.197265625, + "learning_rate": 4.158479461669905e-07, + "loss": 0.1318, + "num_tokens": 1060577023.0, + "reward": 1.0625, + "reward_std": 0.28924068808555603, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.15596230328083038, "step": 1866 }, { @@ -54129,27 +54129,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1982.0, - "completions/mean_length": 731.671875, - "completions/mean_terminated_length": 661.2510375976562, - "completions/min_length": 117.0, - "completions/min_terminated_length": 117.0, + "completions/max_terminated_length": 1901.0, + "completions/mean_length": 683.732421875, + "completions/mean_terminated_length": 675.6915893554688, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, "epoch": 0.637364513100623, - "grad_norm": 1.3973826169967651, - "kl": 7.515625, - "learning_rate": 4.151298470083954e-07, - "loss": 0.4895, - "num_tokens": 1027251289.0, - "reward": 1.85205078125, - "reward_std": 0.5546400547027588, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.2020028978586197, + "grad_norm": 5.638597011566162, + "kl": 2.3671875, + "learning_rate": 4.1533611643244484e-07, + "loss": 0.1841, + "num_tokens": 1061003110.0, + "reward": 1.09130859375, + "reward_std": 0.2875339388847351, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.025390625, + "rewards/format_reward/std": 0.15746226906776428, + "rewards/tag_count_reward/mean": 0.95263671875, + "rewards/tag_count_reward/std": 0.15188921988010406, "step": 1867 }, { @@ -54158,27 +54158,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 783.55859375, - "completions/mean_terminated_length": 742.7701416015625, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 767.98046875, + "completions/mean_terminated_length": 762.9608154296875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.6377058974140138, - "grad_norm": 1.1674144268035889, - "kl": 6.3125, - "learning_rate": 4.146184798174983e-07, - "loss": 0.3714, - "num_tokens": 1027726983.0, - "reward": 1.88427734375, - "reward_std": 0.581419825553894, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.1974821388721466, + "grad_norm": 5.758982181549072, + "kl": 3.06640625, + "learning_rate": 4.148244779736946e-07, + "loss": 0.1665, + "num_tokens": 1061470828.0, + "reward": 1.1572265625, + "reward_std": 0.3125010132789612, + "rewards/accuracy_reward/mean": 0.18359375, + "rewards/accuracy_reward/std": 0.3875311613082886, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.18003050982952118, "step": 1868 }, { @@ -54187,27 +54187,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 762.9296875, - "completions/mean_terminated_length": 729.450927734375, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 1727.0, + "completions/mean_length": 740.34375, + "completions/mean_terminated_length": 732.6365966796875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, "epoch": 0.6380472817274047, - "grad_norm": 1.9474679231643677, - "kl": 5.96875, - "learning_rate": 4.141073047759076e-07, - "loss": 0.3926, - "num_tokens": 1028195667.0, - "reward": 1.91845703125, - "reward_std": 0.5201526284217834, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.17180897295475006, + "grad_norm": 4.374019145965576, + "kl": 2.48046875, + "learning_rate": 4.143130315174683e-07, + "loss": 0.1259, + "num_tokens": 1061927948.0, + "reward": 1.13818359375, + "reward_std": 0.31949272751808167, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.94873046875, + "rewards/tag_count_reward/std": 0.16082392632961273, "step": 1869 }, { @@ -54216,27 +54216,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 727.17578125, - "completions/mean_terminated_length": 706.2103881835938, - "completions/min_length": 85.0, - "completions/min_terminated_length": 85.0, + "completions/max_terminated_length": 1759.0, + "completions/mean_length": 746.935546875, + "completions/mean_terminated_length": 736.69091796875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, "epoch": 0.6383886660407955, - "grad_norm": 1.4593170881271362, - "kl": 3.56640625, - "learning_rate": 4.135963226091426e-07, - "loss": 0.2256, - "num_tokens": 1028659709.0, - "reward": 1.9609375, - "reward_std": 0.43016159534454346, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.91015625, - "rewards/format_reward/std": 0.2862374484539032, - "rewards/tag_count_reward/mean": 0.9609375, - "rewards/tag_count_reward/std": 0.14141270518302917, + "grad_norm": 4.042114734649658, + "kl": 1.990234375, + "learning_rate": 4.138017777902214e-07, + "loss": 0.1029, + "num_tokens": 1062402107.0, + "reward": 1.07421875, + "reward_std": 0.31056299805641174, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.025390625, + "rewards/format_reward/std": 0.15746226906776428, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.15163815021514893, "step": 1870 }, { @@ -54245,27 +54245,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 722.958984375, - "completions/mean_terminated_length": 685.7088012695312, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 734.103515625, + "completions/mean_terminated_length": 728.9510498046875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, "epoch": 0.6387300503541862, - "grad_norm": 1.3785734176635742, - "kl": 4.734375, - "learning_rate": 4.1308553404244927e-07, - "loss": 0.2784, - "num_tokens": 1029110824.0, - "reward": 1.97216796875, - "reward_std": 0.4718218445777893, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310423493385315, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.94873046875, - "rewards/tag_count_reward/std": 0.1638377457857132, + "grad_norm": 2.9361367225646973, + "kl": 2.072265625, + "learning_rate": 4.1329071751813606e-07, + "loss": 0.1065, + "num_tokens": 1062858928.0, + "reward": 1.14208984375, + "reward_std": 0.3141011595726013, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15143637359142303, + "rewards/tag_count_reward/mean": 0.95458984375, + "rewards/tag_count_reward/std": 0.14592784643173218, "step": 1871 }, { @@ -54274,27 +54274,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1963.0, - "completions/mean_length": 746.15625, - "completions/mean_terminated_length": 725.4921264648438, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 714.966796875, + "completions/mean_terminated_length": 709.7392578125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, "epoch": 0.639071434667577, - "grad_norm": 1.8401607275009155, - "kl": 3.46875, - "learning_rate": 4.1257493980079825e-07, - "loss": 0.2358, - "num_tokens": 1029567480.0, - "reward": 1.96435546875, - "reward_std": 0.3924785256385803, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.919921875, - "rewards/format_reward/std": 0.271679550409317, - "rewards/tag_count_reward/mean": 0.95849609375, - "rewards/tag_count_reward/std": 0.14201197028160095, + "grad_norm": 6.739040851593018, + "kl": 2.357421875, + "learning_rate": 4.127798514271187e-07, + "loss": 0.1246, + "num_tokens": 1063299615.0, + "reward": 1.0751953125, + "reward_std": 0.27408266067504883, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.9501953125, + "rewards/tag_count_reward/std": 0.1570582091808319, "step": 1872 }, { @@ -54303,27 +54303,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 779.810546875, - "completions/mean_terminated_length": 749.3740234375, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/max_terminated_length": 1951.0, + "completions/mean_length": 751.359375, + "completions/mean_terminated_length": 746.2745361328125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.6394128189809678, - "grad_norm": 1.39500892162323, - "kl": 4.56640625, - "learning_rate": 4.120645406088846e-07, - "loss": 0.3221, - "num_tokens": 1030036919.0, - "reward": 1.970703125, - "reward_std": 0.4239633083343506, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.921875, - "rewards/format_reward/std": 0.26863065361976624, - "rewards/tag_count_reward/mean": 0.9609375, - "rewards/tag_count_reward/std": 0.14141270518302917, + "grad_norm": 2.6407740116119385, + "kl": 2.4140625, + "learning_rate": 4.122691802428011e-07, + "loss": 0.1392, + "num_tokens": 1063754487.0, + "reward": 1.125, + "reward_std": 0.33741819858551025, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.943359375, + "rewards/tag_count_reward/std": 0.16762074828147888, "step": 1873 }, { @@ -54332,27 +54332,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 758.751953125, - "completions/mean_terminated_length": 733.0697631835938, - "completions/min_length": 206.0, - "completions/min_terminated_length": 206.0, + "completions/max_terminated_length": 1943.0, + "completions/mean_length": 758.544921875, + "completions/mean_terminated_length": 753.48828125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, "epoch": 0.6397542032943586, - "grad_norm": 1.3300796747207642, - "kl": 3.9140625, - "learning_rate": 4.1155433719112696e-07, - "loss": 0.2804, - "num_tokens": 1030504920.0, - "reward": 1.87744140625, - "reward_std": 0.3875640630722046, - "rewards/accuracy_reward/mean": 0.04032257944345474, - "rewards/accuracy_reward/std": 0.19691328704357147, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.94775390625, - "rewards/tag_count_reward/std": 0.15820616483688354, + "grad_norm": 2.9055283069610596, + "kl": 2.29296875, + "learning_rate": 4.117587046905372e-07, + "loss": 0.1628, + "num_tokens": 1064222382.0, + "reward": 0.99755859375, + "reward_std": 0.25872889161109924, + "rewards/accuracy_reward/mean": 0.03629032149910927, + "rewards/accuracy_reward/std": 0.1872003972530365, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.17502138018608093, "step": 1874 }, { @@ -54361,27 +54361,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 756.87109375, - "completions/mean_terminated_length": 731.1514282226562, - "completions/min_length": 91.0, - "completions/min_terminated_length": 91.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 729.14453125, + "completions/mean_terminated_length": 708.2103881835938, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.6400955876077494, - "grad_norm": 1.3403873443603516, - "kl": 6.0546875, - "learning_rate": 4.1104433027166564e-07, - "loss": 0.3892, - "num_tokens": 1030978454.0, - "reward": 1.8720703125, - "reward_std": 0.4538940191268921, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.9443359375, - "rewards/tag_count_reward/std": 0.16574904322624207, + "grad_norm": 2.852483034133911, + "kl": 3.47265625, + "learning_rate": 4.112484254954038e-07, + "loss": 0.2315, + "num_tokens": 1064681720.0, + "reward": 1.044921875, + "reward_std": 0.2961057424545288, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.1720430850982666, "step": 1875 }, { @@ -54390,27 +54390,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1891.0, - "completions/mean_length": 758.2578125, - "completions/mean_terminated_length": 713.9636840820312, - "completions/min_length": 210.0, - "completions/min_terminated_length": 210.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1879.0, + "completions/max_terminated_length": 1879.0, + "completions/mean_length": 704.23828125, + "completions/mean_terminated_length": 704.23828125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.6404369719211402, - "grad_norm": 1.9409515857696533, - "kl": 7.125, - "learning_rate": 4.1053452057436213e-07, - "loss": 0.4875, - "num_tokens": 1031437210.0, - "reward": 1.84326171875, - "reward_std": 0.5266550183296204, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.18445254862308502, + "grad_norm": 6.543737888336182, + "kl": 1.83984375, + "learning_rate": 4.1073834338219827e-07, + "loss": 0.1258, + "num_tokens": 1065112818.0, + "reward": 1.064453125, + "reward_std": 0.28870296478271484, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.025390625, + "rewards/format_reward/std": 0.15746226906776428, + "rewards/tag_count_reward/mean": 0.947265625, + "rewards/tag_count_reward/std": 0.16299647092819214, "step": 1876 }, { @@ -54419,27 +54419,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 796.19140625, - "completions/mean_terminated_length": 755.8104858398438, - "completions/min_length": 125.0, - "completions/min_terminated_length": 125.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 825.724609375, + "completions/mean_terminated_length": 813.6705932617188, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.640778356234531, - "grad_norm": 2.3897781372070312, - "kl": 8.1328125, - "learning_rate": 4.1002490882279804e-07, - "loss": 0.5263, - "num_tokens": 1031921308.0, - "reward": 1.8662109375, - "reward_std": 0.48998963832855225, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.17729215323925018, + "grad_norm": 6.474228858947754, + "kl": 3.39453125, + "learning_rate": 4.1022845907543835e-07, + "loss": 0.2233, + "num_tokens": 1065612037.0, + "reward": 1.0458984375, + "reward_std": 0.3379908800125122, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.9189453125, + "rewards/tag_count_reward/std": 0.20034818351268768, "step": 1877 }, { @@ -54448,27 +54448,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 710.53515625, - "completions/mean_terminated_length": 689.3056030273438, - "completions/min_length": 125.0, - "completions/min_terminated_length": 125.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 715.380859375, + "completions/mean_terminated_length": 710.1549682617188, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, "epoch": 0.6411197405479219, - "grad_norm": 2.4686570167541504, - "kl": 6.171875, - "learning_rate": 4.0951549574027434e-07, - "loss": 0.3629, - "num_tokens": 1032361662.0, - "reward": 1.869140625, - "reward_std": 0.43076658248901367, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.94921875, - "rewards/tag_count_reward/std": 0.1551761031150818, + "grad_norm": 3.2534339427948, + "kl": 2.259765625, + "learning_rate": 4.097187732993611e-07, + "loss": 0.1321, + "num_tokens": 1066054872.0, + "reward": 1.0419921875, + "reward_std": 0.2746192514896393, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.9541015625, + "rewards/tag_count_reward/std": 0.1445106863975525, "step": 1878 }, { @@ -54477,27 +54477,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 772.380859375, - "completions/mean_terminated_length": 720.5263671875, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/max_terminated_length": 1893.0, + "completions/mean_length": 747.1640625, + "completions/mean_terminated_length": 739.4970703125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, "epoch": 0.6414611248613126, - "grad_norm": 4.238211154937744, - "kl": 9.546875, - "learning_rate": 4.0900628204980924e-07, - "loss": 0.5669, - "num_tokens": 1032825121.0, - "reward": 1.8388671875, - "reward_std": 0.5338704586029053, - "rewards/accuracy_reward/mean": 0.058467742055654526, - "rewards/accuracy_reward/std": 0.23486270010471344, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.20006181299686432, + "grad_norm": 4.012604236602783, + "kl": 2.330078125, + "learning_rate": 4.0920928677792067e-07, + "loss": 0.1503, + "num_tokens": 1066505420.0, + "reward": 1.03466796875, + "reward_std": 0.2672814726829529, + "rewards/accuracy_reward/mean": 0.08064515888690948, + "rewards/accuracy_reward/std": 0.2725643217563629, + "rewards/format_reward/mean": 0.013671875, + "rewards/format_reward/std": 0.1162383034825325, + "rewards/tag_count_reward/mean": 0.94287109375, + "rewards/tag_count_reward/std": 0.1588330715894699, "step": 1879 }, { @@ -54506,27 +54506,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 771.0703125, - "completions/mean_terminated_length": 737.8035888671875, - "completions/min_length": 117.0, - "completions/min_terminated_length": 117.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 753.23828125, + "completions/mean_terminated_length": 740.4694213867188, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, "epoch": 0.6418025091747034, - "grad_norm": 4.924428939819336, - "kl": 8.28125, - "learning_rate": 4.084972684741386e-07, - "loss": 0.4544, - "num_tokens": 1033305333.0, - "reward": 1.8779296875, - "reward_std": 0.5271910429000854, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.19587218761444092, + "grad_norm": 5.674792289733887, + "kl": 2.80859375, + "learning_rate": 4.08700000234789e-07, + "loss": 0.1879, + "num_tokens": 1066976502.0, + "reward": 1.1240234375, + "reward_std": 0.2948102355003357, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, + "rewards/format_reward/mean": 0.025390625, + "rewards/format_reward/std": 0.15746226906776428, + "rewards/tag_count_reward/mean": 0.9443359375, + "rewards/tag_count_reward/std": 0.16648533940315247, "step": 1880 }, { @@ -54535,27 +54535,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1922.0, - "completions/mean_length": 780.056640625, - "completions/mean_terminated_length": 749.6260375976562, - "completions/min_length": 183.0, - "completions/min_terminated_length": 183.0, + "completions/max_terminated_length": 1805.0, + "completions/mean_length": 733.0703125, + "completions/mean_terminated_length": 725.3202514648438, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.6421438934880942, - "grad_norm": 2.503539800643921, - "kl": 7.171875, - "learning_rate": 4.079884557357142e-07, - "loss": 0.4537, - "num_tokens": 1033781650.0, - "reward": 1.84423828125, - "reward_std": 0.4704209268093109, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.18977881968021393, + "grad_norm": 2.302811622619629, + "kl": 2.1650390625, + "learning_rate": 4.081909143933536e-07, + "loss": 0.1217, + "num_tokens": 1067428762.0, + "reward": 1.04296875, + "reward_std": 0.25573617219924927, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.16840559244155884, "step": 1881 }, { @@ -54564,27 +54564,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1980.0, - "completions/mean_length": 775.88671875, - "completions/mean_terminated_length": 742.7454833984375, - "completions/min_length": 190.0, - "completions/min_terminated_length": 190.0, + "completions/max_terminated_length": 1732.0, + "completions/mean_length": 733.962890625, + "completions/mean_terminated_length": 728.8098754882812, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.642485277801485, - "grad_norm": 1.1857963800430298, - "kl": 5.984375, - "learning_rate": 4.0747984455670257e-07, - "loss": 0.4052, - "num_tokens": 1034259944.0, - "reward": 1.89892578125, - "reward_std": 0.49622827768325806, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.94580078125, - "rewards/tag_count_reward/std": 0.16512742638587952, + "grad_norm": 5.937273025512695, + "kl": 2.890625, + "learning_rate": 4.076820299767173e-07, + "loss": 0.1737, + "num_tokens": 1067885591.0, + "reward": 1.1005859375, + "reward_std": 0.3303375542163849, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.9501953125, + "rewards/tag_count_reward/std": 0.155492901802063, "step": 1882 }, { @@ -54593,27 +54593,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1895.0, - "completions/mean_length": 749.095703125, - "completions/mean_terminated_length": 723.2211303710938, - "completions/min_length": 13.0, - "completions/min_terminated_length": 13.0, + "completions/max_terminated_length": 1917.0, + "completions/mean_length": 753.779296875, + "completions/mean_terminated_length": 743.5885620117188, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, "epoch": 0.6428266621148758, - "grad_norm": 0.7971708178520203, - "kl": 5.11328125, - "learning_rate": 4.069714356589844e-07, - "loss": 0.3169, - "num_tokens": 1034724873.0, - "reward": 1.91064453125, - "reward_std": 0.49229228496551514, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.93994140625, - "rewards/tag_count_reward/std": 0.17537043988704681, + "grad_norm": 3.903597593307495, + "kl": 2.572265625, + "learning_rate": 4.0717334770769627e-07, + "loss": 0.1822, + "num_tokens": 1068352918.0, + "reward": 1.08544921875, + "reward_std": 0.3102695643901825, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.17221449315547943, "step": 1883 }, { @@ -54622,27 +54622,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1744.0, - "completions/mean_length": 760.310546875, - "completions/mean_terminated_length": 724.1104125976562, - "completions/min_length": 62.0, - "completions/min_terminated_length": 62.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 788.97265625, + "completions/mean_terminated_length": 779.05908203125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.6431680464282666, - "grad_norm": 1.1620593070983887, - "kl": 4.72265625, - "learning_rate": 4.064632297641533e-07, - "loss": 0.3192, - "num_tokens": 1035192024.0, - "reward": 1.939453125, - "reward_std": 0.49195238947868347, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.94140625, - "rewards/tag_count_reward/std": 0.17270830273628235, + "grad_norm": 3.0366134643554688, + "kl": 2.525390625, + "learning_rate": 4.066648683088203e-07, + "loss": 0.1695, + "num_tokens": 1068834744.0, + "reward": 1.11669921875, + "reward_std": 0.29594817757606506, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.025390625, + "rewards/format_reward/std": 0.15746226906776428, + "rewards/tag_count_reward/mean": 0.94482421875, + "rewards/tag_count_reward/std": 0.16628073155879974, "step": 1884 }, { @@ -54651,27 +54651,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 796.009765625, - "completions/mean_terminated_length": 763.392822265625, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 798.880859375, + "completions/mean_terminated_length": 793.982421875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.6435094307416575, - "grad_norm": 1.8055346012115479, - "kl": 5.65625, - "learning_rate": 4.05955227593514e-07, - "loss": 0.3936, - "num_tokens": 1035681293.0, - "reward": 1.85595703125, - "reward_std": 0.4806082844734192, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.17322689294815063, + "grad_norm": 4.649672985076904, + "kl": 3.259765625, + "learning_rate": 4.0615659250232993e-07, + "loss": 0.2115, + "num_tokens": 1069325483.0, + "reward": 1.021484375, + "reward_std": 0.24730777740478516, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15143637359142303, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.18037252128124237, "step": 1885 }, { @@ -54680,27 +54680,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 755.978515625, - "completions/mean_terminated_length": 711.6060791015625, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 748.326171875, + "completions/mean_terminated_length": 735.5089111328125, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.6438508150550483, - "grad_norm": 1.9870210886001587, - "kl": 5.98828125, - "learning_rate": 4.05447429868083e-07, - "loss": 0.431, - "num_tokens": 1036144626.0, - "reward": 1.91650390625, - "reward_std": 0.5447916984558105, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.18874379992485046, + "grad_norm": 2.148470163345337, + "kl": 2.80078125, + "learning_rate": 4.0564852101017754e-07, + "loss": 0.1861, + "num_tokens": 1069784898.0, + "reward": 1.16015625, + "reward_std": 0.29293882846832275, + "rewards/accuracy_reward/mean": 0.181640625, + "rewards/accuracy_reward/std": 0.38592514395713806, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, + "rewards/tag_count_reward/mean": 0.951171875, + "rewards/tag_count_reward/std": 0.15658599138259888, "step": 1886 }, { @@ -54709,27 +54709,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1886.0, - "completions/mean_length": 767.521484375, - "completions/mean_terminated_length": 720.8643798828125, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/max_terminated_length": 1739.0, + "completions/mean_length": 767.845703125, + "completions/mean_terminated_length": 755.220947265625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, "epoch": 0.644192199368439, - "grad_norm": 1.1415534019470215, - "kl": 5.625, - "learning_rate": 4.049398373085862e-07, - "loss": 0.3773, - "num_tokens": 1036621789.0, - "reward": 1.89111328125, - "reward_std": 0.4812566041946411, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.93994140625, - "rewards/tag_count_reward/std": 0.17041848599910736, + "grad_norm": 2.846689462661743, + "kl": 2.341796875, + "learning_rate": 4.051406545540248e-07, + "loss": 0.1499, + "num_tokens": 1070262227.0, + "reward": 1.07861328125, + "reward_std": 0.2663132846355438, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.021484375, + "rewards/format_reward/std": 0.14513419568538666, + "rewards/tag_count_reward/mean": 0.94970703125, + "rewards/tag_count_reward/std": 0.15414974093437195, "step": 1887 }, { @@ -54738,27 +54738,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1968.0, - "completions/mean_length": 773.734375, - "completions/mean_terminated_length": 719.2342529296875, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 1855.0, + "completions/mean_length": 726.455078125, + "completions/mean_terminated_length": 716.0491943359375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.6445335836818298, - "grad_norm": 1.1535128355026245, - "kl": 7.625, - "learning_rate": 4.044324506354585e-07, - "loss": 0.5109, - "num_tokens": 1037091765.0, - "reward": 1.796875, - "reward_std": 0.5084168910980225, - "rewards/accuracy_reward/mean": 0.021484375, - "rewards/accuracy_reward/std": 0.14513419568538666, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.19859731197357178, + "grad_norm": 5.727444171905518, + "kl": 3.158203125, + "learning_rate": 4.046329938552424e-07, + "loss": 0.1719, + "num_tokens": 1070707996.0, + "reward": 1.0341796875, + "reward_std": 0.2779213786125183, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.2029850035905838, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.9423828125, + "rewards/tag_count_reward/std": 0.1672869324684143, "step": 1888 }, { @@ -54767,27 +54767,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 708.26953125, - "completions/mean_terminated_length": 665.0524291992188, - "completions/min_length": 101.0, - "completions/min_terminated_length": 101.0, + "completions/max_terminated_length": 1870.0, + "completions/mean_length": 707.712890625, + "completions/mean_terminated_length": 702.4569091796875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.6448749679952206, - "grad_norm": 1.4276211261749268, - "kl": 7.2265625, - "learning_rate": 4.0392527056884254e-07, - "loss": 0.447, - "num_tokens": 1037532111.0, - "reward": 1.80419921875, - "reward_std": 0.5305805206298828, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.19918283820152283, + "grad_norm": 6.767913341522217, + "kl": 2.8046875, + "learning_rate": 4.041255396349085e-07, + "loss": 0.1232, + "num_tokens": 1071148057.0, + "reward": 1.015625, + "reward_std": 0.24684959650039673, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.2029850035905838, + "rewards/format_reward/mean": 0.025390625, + "rewards/format_reward/std": 0.15746226906776428, + "rewards/tag_count_reward/mean": 0.947265625, + "rewards/tag_count_reward/std": 0.15920037031173706, "step": 1889 }, { @@ -54796,27 +54796,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1990.0, - "completions/mean_length": 757.015625, - "completions/mean_terminated_length": 707.2616577148438, - "completions/min_length": 78.0, - "completions/min_terminated_length": 78.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 737.830078125, + "completions/mean_terminated_length": 732.6921997070312, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, "epoch": 0.6452163523086114, - "grad_norm": 1.2638360261917114, - "kl": 6.40625, - "learning_rate": 4.034182978285877e-07, - "loss": 0.4309, - "num_tokens": 1037990503.0, - "reward": 1.86181640625, - "reward_std": 0.46222448348999023, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.93798828125, - "rewards/tag_count_reward/std": 0.17815442383289337, + "grad_norm": 5.358652114868164, + "kl": 3.078125, + "learning_rate": 4.036182926138082e-07, + "loss": 0.1542, + "num_tokens": 1071596626.0, + "reward": 1.07275390625, + "reward_std": 0.2963595986366272, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.94384765625, + "rewards/tag_count_reward/std": 0.16521421074867249, "step": 1890 }, { @@ -54825,27 +54825,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1881.0, - "completions/mean_length": 763.83203125, - "completions/mean_terminated_length": 740.8548583984375, - "completions/min_length": 38.0, - "completions/min_terminated_length": 38.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 776.578125, + "completions/mean_terminated_length": 774.0900268554688, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.6455577366220022, - "grad_norm": 1.6150527000427246, - "kl": 5.5234375, - "learning_rate": 4.0291153313424874e-07, - "loss": 0.3192, - "num_tokens": 1038459409.0, - "reward": 1.8896484375, - "reward_std": 0.4685768783092499, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.18000927567481995, + "grad_norm": 15.217375755310059, + "kl": 3.2109375, + "learning_rate": 4.03111253512432e-07, + "loss": 0.1232, + "num_tokens": 1072072058.0, + "reward": 1.08203125, + "reward_std": 0.3244349956512451, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.947265625, + "rewards/tag_count_reward/std": 0.16523228585720062, "step": 1891 }, { @@ -54854,27 +54854,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1884.0, - "completions/mean_length": 783.49609375, - "completions/mean_terminated_length": 740.0687255859375, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 815.859375, + "completions/mean_terminated_length": 808.5972900390625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.645899120935393, - "grad_norm": 1.8988356590270996, - "kl": 6.3125, - "learning_rate": 4.024049772050857e-07, - "loss": 0.4145, - "num_tokens": 1038938847.0, - "reward": 1.84765625, - "reward_std": 0.5264095664024353, - "rewards/accuracy_reward/mean": 0.05040322616696358, - "rewards/accuracy_reward/std": 0.21899642050266266, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.18752038478851318, + "grad_norm": 3.4547815322875977, + "kl": 2.580078125, + "learning_rate": 4.0260442305097574e-07, + "loss": 0.1687, + "num_tokens": 1072568066.0, + "reward": 1.0244140625, + "reward_std": 0.27615582942962646, + "rewards/accuracy_reward/mean": 0.05645161122083664, + "rewards/accuracy_reward/std": 0.23102474212646484, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.9404296875, + "rewards/tag_count_reward/std": 0.17237325012683868, "step": 1892 }, { @@ -54883,27 +54883,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1951.0, - "completions/mean_length": 806.35546875, - "completions/mean_terminated_length": 731.805419921875, - "completions/min_length": 30.0, - "completions/min_terminated_length": 30.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1971.0, + "completions/max_terminated_length": 1971.0, + "completions/mean_length": 744.0625, + "completions/mean_terminated_length": 744.0625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, "epoch": 0.6462405052487838, - "grad_norm": 2.275977849960327, - "kl": 7.921875, - "learning_rate": 4.01898630760062e-07, - "loss": 0.4934, - "num_tokens": 1039434949.0, - "reward": 1.82470703125, - "reward_std": 0.5624501705169678, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.2154705673456192, + "grad_norm": 7.716210842132568, + "kl": 2.43359375, + "learning_rate": 4.0209780194933796e-07, + "loss": 0.197, + "num_tokens": 1073032274.0, + "reward": 1.07666015625, + "reward_std": 0.2412114441394806, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12414088100194931, + "rewards/tag_count_reward/mean": 0.95361328125, + "rewards/tag_count_reward/std": 0.14729003608226776, "step": 1893 }, { @@ -54912,27 +54912,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1924.0, - "completions/mean_length": 782.603515625, - "completions/mean_terminated_length": 741.7842407226562, - "completions/min_length": 221.0, - "completions/min_terminated_length": 221.0, + "completions/max_terminated_length": 1798.0, + "completions/mean_length": 772.779296875, + "completions/mean_terminated_length": 767.7785034179688, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.6465818895621747, - "grad_norm": 1.3620362281799316, - "kl": 6.9453125, - "learning_rate": 4.0139249451784383e-07, - "loss": 0.4666, - "num_tokens": 1039908922.0, - "reward": 1.86181640625, - "reward_std": 0.5229591131210327, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.1854754239320755, + "grad_norm": 2.5311384201049805, + "kl": 2.1484375, + "learning_rate": 4.015913909271207e-07, + "loss": 0.1087, + "num_tokens": 1073501217.0, + "reward": 1.07470703125, + "reward_std": 0.29827556014060974, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.94189453125, + "rewards/tag_count_reward/std": 0.16601619124412537, "step": 1894 }, { @@ -54941,27 +54941,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1941.0, - "completions/mean_length": 822.70703125, - "completions/mean_terminated_length": 783.1814575195312, - "completions/min_length": 153.0, - "completions/min_terminated_length": 153.0, + "completions/max_terminated_length": 1956.0, + "completions/mean_length": 828.41015625, + "completions/mean_terminated_length": 816.3826293945312, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.6469232738755654, - "grad_norm": 2.305196762084961, - "kl": 6.9453125, - "learning_rate": 4.00886569196799e-07, - "loss": 0.4086, - "num_tokens": 1040410996.0, - "reward": 1.86083984375, - "reward_std": 0.5651666522026062, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.2054828256368637, + "grad_norm": 2.1754817962646484, + "kl": 2.38671875, + "learning_rate": 4.010851907036268e-07, + "loss": 0.1485, + "num_tokens": 1074006211.0, + "reward": 1.146484375, + "reward_std": 0.28077906370162964, + "rewards/accuracy_reward/mean": 0.166015625, + "rewards/accuracy_reward/std": 0.3724585771560669, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15143637359142303, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.14200608432292938, "step": 1895 }, { @@ -54970,27 +54970,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 767.3203125, - "completions/mean_terminated_length": 720.6558837890625, - "completions/min_length": 16.0, - "completions/min_terminated_length": 16.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 759.87109375, + "completions/mean_terminated_length": 754.8196411132812, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.6472646581889562, - "grad_norm": 0.988052248954773, - "kl": 5.7578125, - "learning_rate": 4.003808555149961e-07, - "loss": 0.342, - "num_tokens": 1040889832.0, - "reward": 1.8408203125, - "reward_std": 0.5765224695205688, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.9111328125, - "rewards/tag_count_reward/std": 0.21195456385612488, + "grad_norm": 2.0289859771728516, + "kl": 2.078125, + "learning_rate": 4.005792019978607e-07, + "loss": 0.1013, + "num_tokens": 1074481233.0, + "reward": 1.150390625, + "reward_std": 0.336341917514801, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.14791233837604523, "step": 1896 }, { @@ -54999,27 +54999,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1802.0, - "completions/mean_length": 736.21875, - "completions/mean_terminated_length": 707.4171752929688, - "completions/min_length": 32.0, - "completions/min_terminated_length": 32.0, + "completions/max_terminated_length": 1936.0, + "completions/mean_length": 776.498046875, + "completions/mean_terminated_length": 761.4209594726562, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.647606042502347, - "grad_norm": 2.5826499462127686, - "kl": 5.265625, - "learning_rate": 3.9987535419020303e-07, - "loss": 0.3584, - "num_tokens": 1041344600.0, - "reward": 1.9052734375, - "reward_std": 0.4933793842792511, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.9443359375, - "rewards/tag_count_reward/std": 0.1743793785572052, + "grad_norm": 2.9222843647003174, + "kl": 2.359375, + "learning_rate": 4.000734255285252e-07, + "loss": 0.1657, + "num_tokens": 1074956624.0, + "reward": 1.07568359375, + "reward_std": 0.30586332082748413, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15143637359142303, + "rewards/tag_count_reward/mean": 0.94873046875, + "rewards/tag_count_reward/std": 0.15697528421878815, "step": 1897 }, { @@ -55028,27 +55028,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1908.0, - "completions/mean_length": 741.005859375, - "completions/mean_terminated_length": 696.1192016601562, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/max_terminated_length": 1833.0, + "completions/mean_length": 722.1875, + "completions/mean_terminated_length": 716.98828125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, "epoch": 0.6479474268157378, - "grad_norm": 2.448514938354492, - "kl": 6.5078125, - "learning_rate": 3.993700659398863e-07, - "loss": 0.4556, - "num_tokens": 1041799115.0, - "reward": 1.845703125, - "reward_std": 0.4928697347640991, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.18937622010707855, + "grad_norm": 5.153934478759766, + "kl": 2.10546875, + "learning_rate": 3.995678620140227e-07, + "loss": 0.1346, + "num_tokens": 1075401504.0, + "reward": 1.0634765625, + "reward_std": 0.2519616484642029, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.9619140625, + "rewards/tag_count_reward/std": 0.12997308373451233, "step": 1898 }, { @@ -55057,27 +55057,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 840.2734375, - "completions/mean_terminated_length": 803.8229370117188, - "completions/min_length": 167.0, - "completions/min_terminated_length": 167.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 794.666015625, + "completions/mean_terminated_length": 787.2789916992188, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, "epoch": 0.6482888111291286, - "grad_norm": 1.5282440185546875, - "kl": 7.578125, - "learning_rate": 3.9886499148121055e-07, - "loss": 0.5177, - "num_tokens": 1042306039.0, - "reward": 1.81982421875, - "reward_std": 0.5226191282272339, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.18766237795352936, + "grad_norm": 2.8712620735168457, + "kl": 2.03125, + "learning_rate": 3.9906251217245234e-07, + "loss": 0.1056, + "num_tokens": 1075885077.0, + "reward": 1.04736328125, + "reward_std": 0.27815550565719604, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.94970703125, + "rewards/tag_count_reward/std": 0.1549411565065384, "step": 1899 }, { @@ -55086,27 +55086,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 786.603515625, - "completions/mean_terminated_length": 745.9132690429688, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/max_terminated_length": 1859.0, + "completions/mean_length": 724.427734375, + "completions/mean_terminated_length": 721.8375854492188, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.6486301954425194, - "grad_norm": 1.0351145267486572, - "kl": 5.6015625, - "learning_rate": 3.9836013153103643e-07, - "loss": 0.3619, - "num_tokens": 1042789308.0, - "reward": 1.92578125, - "reward_std": 0.5136724710464478, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.16988569498062134, + "grad_norm": 2.0019853115081787, + "kl": 2.72265625, + "learning_rate": 3.985573767216104e-07, + "loss": 0.1637, + "num_tokens": 1076336512.0, + "reward": 1.11083984375, + "reward_std": 0.300670862197876, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.94677734375, + "rewards/tag_count_reward/std": 0.16018691658973694, "step": 1900 }, { @@ -55115,27 +55115,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 761.123046875, - "completions/mean_terminated_length": 714.2327880859375, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 1886.0, + "completions/mean_length": 774.89453125, + "completions/mean_terminated_length": 759.7984619140625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.6489715797559102, - "grad_norm": 1.6239062547683716, - "kl": 7.546875, - "learning_rate": 3.9785548680592027e-07, - "loss": 0.4746, - "num_tokens": 1043254859.0, - "reward": 1.86669921875, - "reward_std": 0.5596586465835571, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.19512130320072174, + "grad_norm": 1.9455697536468506, + "kl": 3.20703125, + "learning_rate": 3.980524563789881e-07, + "loss": 0.1804, + "num_tokens": 1076809114.0, + "reward": 1.0556640625, + "reward_std": 0.30967390537261963, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.17590700089931488, "step": 1901 }, { @@ -55144,27 +55144,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1979.0, - "completions/mean_length": 842.279296875, - "completions/mean_terminated_length": 780.384033203125, - "completions/min_length": 93.0, - "completions/min_terminated_length": 93.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 777.01953125, + "completions/mean_terminated_length": 772.0353393554688, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.6493129640693011, - "grad_norm": 1.9798287153244019, - "kl": 7.1015625, - "learning_rate": 3.97351058022113e-07, - "loss": 0.4551, - "num_tokens": 1043762282.0, - "reward": 1.82177734375, - "reward_std": 0.5535703897476196, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.20167623460292816, + "grad_norm": 4.026853084564209, + "kl": 2.80859375, + "learning_rate": 3.975477518617716e-07, + "loss": 0.152, + "num_tokens": 1077283124.0, + "reward": 1.07275390625, + "reward_std": 0.30001622438430786, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.1689939647912979, "step": 1902 }, { @@ -55173,27 +55173,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 842.6171875, - "completions/mean_terminated_length": 791.0631713867188, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/max_terminated_length": 1944.0, + "completions/mean_length": 818.95703125, + "completions/mean_terminated_length": 811.7131958007812, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, "epoch": 0.6496543483826919, - "grad_norm": 1.2850828170776367, - "kl": 6.90625, - "learning_rate": 3.9684684589555894e-07, - "loss": 0.4394, - "num_tokens": 1044273286.0, - "reward": 1.826171875, - "reward_std": 0.5970179438591003, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.91015625, - "rewards/tag_count_reward/std": 0.20686452090740204, + "grad_norm": 2.2637829780578613, + "kl": 2.580078125, + "learning_rate": 3.9704326388683994e-07, + "loss": 0.1264, + "num_tokens": 1077782014.0, + "reward": 1.08642578125, + "reward_std": 0.32736116647720337, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.17398646473884583, "step": 1903 }, { @@ -55202,27 +55202,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 793.005859375, - "completions/mean_terminated_length": 757.724853515625, - "completions/min_length": 138.0, - "completions/min_terminated_length": 138.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 796.12890625, + "completions/mean_terminated_length": 791.2196655273438, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, "epoch": 0.6499957326960826, - "grad_norm": 0.9929895401000977, - "kl": 6.6015625, - "learning_rate": 3.9634285114189505e-07, - "loss": 0.4152, - "num_tokens": 1044758633.0, - "reward": 1.828125, - "reward_std": 0.5333633422851562, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.18155530095100403, + "grad_norm": 4.218906402587891, + "kl": 2.01953125, + "learning_rate": 3.965389931707651e-07, + "loss": 0.0882, + "num_tokens": 1078268960.0, + "reward": 1.099609375, + "reward_std": 0.31150782108306885, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.15483088791370392, "step": 1904 }, { @@ -55231,27 +55231,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1879.0, - "completions/mean_length": 802.458984375, - "completions/mean_terminated_length": 772.5660400390625, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/max_terminated_length": 1707.0, + "completions/mean_length": 795.498046875, + "completions/mean_terminated_length": 793.0469360351562, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, "epoch": 0.6503371170094734, - "grad_norm": 1.1284902095794678, - "kl": 4.95703125, - "learning_rate": 3.958390744764497e-07, - "loss": 0.284, - "num_tokens": 1045247860.0, - "reward": 1.83544921875, - "reward_std": 0.4603354036808014, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.93505859375, - "rewards/tag_count_reward/std": 0.1798466295003891, + "grad_norm": 2.017845869064331, + "kl": 2.3203125, + "learning_rate": 3.9603494042981e-07, + "loss": 0.1258, + "num_tokens": 1078754623.0, + "reward": 1.0224609375, + "reward_std": 0.3042353689670563, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.1765792965888977, "step": 1905 }, { @@ -55260,27 +55260,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 837.228515625, - "completions/mean_terminated_length": 772.4547119140625, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 1929.0, + "completions/mean_length": 813.421875, + "completions/mean_terminated_length": 808.5804443359375, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, "epoch": 0.6506785013228642, - "grad_norm": 1.227197289466858, - "kl": 8.1171875, - "learning_rate": 3.953355166142417e-07, - "loss": 0.511, - "num_tokens": 1045754345.0, - "reward": 1.7978515625, - "reward_std": 0.567955732345581, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.9091796875, - "rewards/tag_count_reward/std": 0.21227891743183136, + "grad_norm": 1.6041759252548218, + "kl": 1.958984375, + "learning_rate": 3.955311063799287e-07, + "loss": 0.0846, + "num_tokens": 1079248919.0, + "reward": 1.0732421875, + "reward_std": 0.30848151445388794, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.025390625, + "rewards/format_reward/std": 0.15746226906776428, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.16953378915786743, "step": 1906 }, { @@ -55289,27 +55289,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1933.0, - "completions/mean_length": 765.3984375, - "completions/mean_terminated_length": 726.6881103515625, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_terminated_length": 1895.0, + "completions/mean_length": 752.515625, + "completions/mean_terminated_length": 744.8801879882812, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, "epoch": 0.651019885636255, - "grad_norm": 2.1376044750213623, - "kl": 4.8125, - "learning_rate": 3.9483217826997927e-07, - "loss": 0.3393, - "num_tokens": 1046224533.0, - "reward": 1.8876953125, - "reward_std": 0.49774402379989624, - "rewards/accuracy_reward/mean": 0.06666667014360428, - "rewards/accuracy_reward/std": 0.24970406293869019, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.17374257743358612, + "grad_norm": 2.248253107070923, + "kl": 2.5546875, + "learning_rate": 3.950274917367638e-07, + "loss": 0.1386, + "num_tokens": 1079712511.0, + "reward": 1.05810546875, + "reward_std": 0.29562437534332275, + "rewards/accuracy_reward/mean": 0.07708333432674408, + "rewards/accuracy_reward/std": 0.2670018970966339, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.94873046875, + "rewards/tag_count_reward/std": 0.1585259586572647, "step": 1907 }, { @@ -55318,27 +55318,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 811.185546875, - "completions/mean_terminated_length": 755.6550903320312, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 805.080078125, + "completions/mean_terminated_length": 802.6477661132812, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.6513612699496458, - "grad_norm": 1.065099835395813, - "kl": 7.1796875, - "learning_rate": 3.9432906015805946e-07, - "loss": 0.4794, - "num_tokens": 1046711876.0, - "reward": 1.84423828125, - "reward_std": 0.577957272529602, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.21513772010803223, + "grad_norm": 5.085806369781494, + "kl": 2.7421875, + "learning_rate": 3.9452409721564686e-07, + "loss": 0.1775, + "num_tokens": 1080196728.0, + "reward": 1.07421875, + "reward_std": 0.2965303063392639, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, + "rewards/tag_count_reward/mean": 0.939453125, + "rewards/tag_count_reward/std": 0.17274148762226105, "step": 1908 }, { @@ -55347,27 +55347,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1959.0, - "completions/mean_length": 822.78515625, - "completions/mean_terminated_length": 788.34130859375, - "completions/min_length": 186.0, - "completions/min_terminated_length": 186.0, + "completions/max_terminated_length": 1930.0, + "completions/mean_length": 816.642578125, + "completions/mean_terminated_length": 806.9468383789062, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.6517026542630366, - "grad_norm": 1.9318523406982422, - "kl": 5.5546875, - "learning_rate": 3.93826162992566e-07, - "loss": 0.3595, - "num_tokens": 1047217734.0, - "reward": 1.8408203125, - "reward_std": 0.5040803551673889, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.9287109375, - "rewards/tag_count_reward/std": 0.18845312297344208, + "grad_norm": 1.6927117109298706, + "kl": 2.060546875, + "learning_rate": 3.940209235315961e-07, + "loss": 0.0926, + "num_tokens": 1080699441.0, + "reward": 1.05322265625, + "reward_std": 0.29500052332878113, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.1760827749967575, "step": 1909 }, { @@ -55376,27 +55376,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 727.7421875, - "completions/mean_terminated_length": 696.0560302734375, - "completions/min_length": 92.0, - "completions/min_terminated_length": 92.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 760.736328125, + "completions/mean_terminated_length": 742.89306640625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.6520440385764275, - "grad_norm": 1.0456809997558594, - "kl": 4.80078125, - "learning_rate": 3.933234874872695e-07, - "loss": 0.3013, - "num_tokens": 1047669410.0, - "reward": 1.88232421875, - "reward_std": 0.4403984546661377, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, + "grad_norm": 2.669011354446411, + "kl": 3.09765625, + "learning_rate": 3.9351797139931684e-07, + "loss": 0.1651, + "num_tokens": 1081168010.0, + "reward": 1.06201171875, + "reward_std": 0.30134129524230957, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.93701171875, - "rewards/tag_count_reward/std": 0.17849735915660858, + "rewards/tag_count_reward/std": 0.17503775656223297, "step": 1910 }, { @@ -55405,27 +55405,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 795.82421875, - "completions/mean_terminated_length": 760.6224365234375, - "completions/min_length": 94.0, - "completions/min_terminated_length": 94.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 803.95703125, + "completions/mean_terminated_length": 796.624755859375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, "epoch": 0.6523854228898183, - "grad_norm": 1.4011372327804565, - "kl": 6.1640625, - "learning_rate": 3.92821034355626e-07, - "loss": 0.4124, - "num_tokens": 1048152824.0, - "reward": 1.83642578125, - "reward_std": 0.5365915298461914, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.1904222071170807, + "grad_norm": 3.475491762161255, + "kl": 2.80859375, + "learning_rate": 3.930152415331992e-07, + "loss": 0.1516, + "num_tokens": 1081655588.0, + "reward": 1.0400390625, + "reward_std": 0.3331858217716217, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.18649590015411377, "step": 1911 }, { @@ -55434,27 +55434,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1974.0, - "completions/mean_length": 747.33984375, - "completions/mean_terminated_length": 710.7750854492188, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1826.0, + "completions/max_terminated_length": 1826.0, + "completions/mean_length": 734.103515625, + "completions/mean_terminated_length": 734.103515625, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, "epoch": 0.652726807203209, - "grad_norm": 1.0503382682800293, - "kl": 7.578125, - "learning_rate": 3.923188043107758e-07, - "loss": 0.4758, - "num_tokens": 1048608470.0, - "reward": 1.87451171875, - "reward_std": 0.5311700701713562, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.18700948357582092, + "grad_norm": 2.547541618347168, + "kl": 2.001953125, + "learning_rate": 3.925127346473179e-07, + "loss": 0.0879, + "num_tokens": 1082104457.0, + "reward": 1.11279296875, + "reward_std": 0.31989234685897827, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.94677734375, + "rewards/tag_count_reward/std": 0.15553821623325348, "step": 1912 }, { @@ -55463,27 +55463,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 820.09765625, - "completions/mean_terminated_length": 770.1829223632812, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 809.615234375, + "completions/mean_terminated_length": 802.3163452148438, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, "epoch": 0.6530681915165998, - "grad_norm": 1.3700637817382812, - "kl": 6.6484375, - "learning_rate": 3.9181679806554267e-07, - "loss": 0.4261, - "num_tokens": 1049103336.0, - "reward": 1.87939453125, - "reward_std": 0.4995768070220947, - "rewards/accuracy_reward/mean": 0.07258064299821854, - "rewards/accuracy_reward/std": 0.25970885157585144, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.19684240221977234, + "grad_norm": 3.892587184906006, + "kl": 3.1015625, + "learning_rate": 3.9201045145543053e-07, + "loss": 0.1438, + "num_tokens": 1082593956.0, + "reward": 1.0576171875, + "reward_std": 0.3563315272331238, + "rewards/accuracy_reward/mean": 0.08064515888690948, + "rewards/accuracy_reward/std": 0.2725643217563629, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.19002850353717804, "step": 1913 }, { @@ -55492,27 +55492,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1922.0, - "completions/mean_length": 763.25, - "completions/mean_terminated_length": 727.1325073242188, - "completions/min_length": 102.0, - "completions/min_terminated_length": 102.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 789.63671875, + "completions/mean_terminated_length": 764.5697631835938, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.6534095758299906, - "grad_norm": 1.0656079053878784, - "kl": 6.4765625, - "learning_rate": 3.913150163324326e-07, - "loss": 0.3954, - "num_tokens": 1049569656.0, - "reward": 1.84521484375, - "reward_std": 0.5555644631385803, - "rewards/accuracy_reward/mean": 0.07459677755832672, - "rewards/accuracy_reward/std": 0.263004869222641, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.18939577043056488, + "grad_norm": 3.216252326965332, + "kl": 3.2734375, + "learning_rate": 3.9150839267097766e-07, + "loss": 0.2116, + "num_tokens": 1083073786.0, + "reward": 1.07861328125, + "reward_std": 0.319521963596344, + "rewards/accuracy_reward/mean": 0.09879032522439957, + "rewards/accuracy_reward/std": 0.2986815273761749, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.93408203125, + "rewards/tag_count_reward/std": 0.1828656941652298, "step": 1914 }, { @@ -55521,27 +55521,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1966.0, - "completions/mean_length": 777.037109375, - "completions/mean_terminated_length": 717.2576293945312, - "completions/min_length": 186.0, - "completions/min_terminated_length": 186.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 757.3671875, + "completions/mean_terminated_length": 749.7603149414062, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, "epoch": 0.6537509601433814, - "grad_norm": 1.1124001741409302, - "kl": 7.5234375, - "learning_rate": 3.90813459823633e-07, - "loss": 0.4902, - "num_tokens": 1050042699.0, - "reward": 1.79638671875, - "reward_std": 0.5529178380966187, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.91162109375, - "rewards/tag_count_reward/std": 0.21013160049915314, + "grad_norm": 2.0655741691589355, + "kl": 2.3125, + "learning_rate": 3.9100655900708026e-07, + "loss": 0.1061, + "num_tokens": 1083536758.0, + "reward": 1.0625, + "reward_std": 0.32510071992874146, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.18146054446697235, "step": 1915 }, { @@ -55550,27 +55550,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 742.1875, - "completions/mean_terminated_length": 702.776611328125, - "completions/min_length": 92.0, - "completions/min_terminated_length": 92.0, + "completions/max_terminated_length": 1803.0, + "completions/mean_length": 754.4765625, + "completions/mean_terminated_length": 744.2913208007812, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.6540923444567722, - "grad_norm": 1.7929461002349854, - "kl": 5.6171875, - "learning_rate": 3.9031212925101144e-07, - "loss": 0.3663, - "num_tokens": 1050501563.0, - "reward": 1.8544921875, - "reward_std": 0.5551683902740479, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.2006722390651703, + "grad_norm": 3.49310302734375, + "kl": 2.6953125, + "learning_rate": 3.9050495117654e-07, + "loss": 0.1436, + "num_tokens": 1084001914.0, + "reward": 1.08837890625, + "reward_std": 0.3200463056564331, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.93994140625, + "rewards/tag_count_reward/std": 0.1732655018568039, "step": 1916 }, { @@ -55579,27 +55579,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 840.861328125, - "completions/mean_terminated_length": 786.6632080078125, - "completions/min_length": 213.0, - "completions/min_terminated_length": 213.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 861.119140625, + "completions/mean_terminated_length": 842.27978515625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, "epoch": 0.654433728770163, - "grad_norm": 0.7772568464279175, - "kl": 6.671875, - "learning_rate": 3.898110253261151e-07, - "loss": 0.4309, - "num_tokens": 1051008548.0, - "reward": 1.85986328125, - "reward_std": 0.4899458885192871, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.2024376094341278, + "grad_norm": 2.5223731994628906, + "kl": 3.0234375, + "learning_rate": 3.900035698918378e-07, + "loss": 0.1854, + "num_tokens": 1084519271.0, + "reward": 1.03173828125, + "reward_std": 0.3127293586730957, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.92822265625, + "rewards/tag_count_reward/std": 0.1779128909111023, "step": 1917 }, { @@ -55608,27 +55608,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1724.0, - "completions/mean_length": 816.435546875, - "completions/mean_terminated_length": 758.5091552734375, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 1870.0, + "completions/mean_length": 807.61328125, + "completions/mean_terminated_length": 802.7490844726562, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.6547751130835539, - "grad_norm": 1.9527724981307983, - "kl": 5.3125, - "learning_rate": 3.8931014876016944e-07, - "loss": 0.3757, - "num_tokens": 1051504435.0, - "reward": 1.87890625, - "reward_std": 0.465145468711853, - "rewards/accuracy_reward/mean": 0.060483869165182114, - "rewards/accuracy_reward/std": 0.2386218160390854, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.94140625, - "rewards/tag_count_reward/std": 0.17270830273628235, + "grad_norm": 2.123964786529541, + "kl": 2.255859375, + "learning_rate": 3.895024158651329e-07, + "loss": 0.1, + "num_tokens": 1085010641.0, + "reward": 1.07763671875, + "reward_std": 0.3200598359107971, + "rewards/accuracy_reward/mean": 0.0947580635547638, + "rewards/accuracy_reward/std": 0.29317617416381836, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.17221449315547943, "step": 1918 }, { @@ -55637,27 +55637,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1868.0, - "completions/mean_length": 788.767578125, - "completions/mean_terminated_length": 748.1471557617188, - "completions/min_length": 104.0, - "completions/min_terminated_length": 104.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 799.63671875, + "completions/mean_terminated_length": 792.2789916992188, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.6551164973969447, - "grad_norm": 1.0534942150115967, - "kl": 4.74609375, - "learning_rate": 3.88809500264077e-07, - "loss": 0.2982, - "num_tokens": 1051987836.0, - "reward": 1.87109375, - "reward_std": 0.5006071329116821, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.18036192655563354, + "grad_norm": 1.5538283586502075, + "kl": 2.8984375, + "learning_rate": 3.8900148980826097e-07, + "loss": 0.1853, + "num_tokens": 1085499607.0, + "reward": 1.060546875, + "reward_std": 0.3631531596183777, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.21384188532829285, "step": 1919 }, { @@ -55666,27 +55666,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 765.619140625, - "completions/mean_terminated_length": 726.9154663085938, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 1933.0, + "completions/mean_length": 784.76953125, + "completions/mean_terminated_length": 769.79052734375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, "epoch": 0.6554578817103354, - "grad_norm": 1.3416204452514648, - "kl": 5.171875, - "learning_rate": 3.8830908054841673e-07, - "loss": 0.3329, - "num_tokens": 1052455305.0, - "reward": 1.83203125, - "reward_std": 0.5414870977401733, - "rewards/accuracy_reward/mean": 0.058467742055654526, - "rewards/accuracy_reward/std": 0.23486268520355225, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.17608344554901123, + "grad_norm": 7.512059211730957, + "kl": 2.689453125, + "learning_rate": 3.8850079243273514e-07, + "loss": 0.2133, + "num_tokens": 1085976881.0, + "reward": 1.08837890625, + "reward_std": 0.3127474784851074, + "rewards/accuracy_reward/mean": 0.11290322244167328, + "rewards/accuracy_reward/std": 0.3167939782142639, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.93603515625, + "rewards/tag_count_reward/std": 0.17538133263587952, "step": 1920 }, { @@ -55695,27 +55695,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1873.0, - "completions/mean_length": 784.564453125, - "completions/mean_terminated_length": 746.4325561523438, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 1945.0, + "completions/mean_length": 800.359375, + "completions/mean_terminated_length": 790.535400390625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.6557992660237262, - "grad_norm": 1.4465922117233276, - "kl": 6.9921875, - "learning_rate": 3.878088903234431e-07, - "loss": 0.415, - "num_tokens": 1052933530.0, - "reward": 1.78955078125, - "reward_std": 0.556403398513794, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.20086915791034698, + "grad_norm": 3.450766086578369, + "kl": 1.849609375, + "learning_rate": 3.880003244497427e-07, + "loss": 0.1215, + "num_tokens": 1086463193.0, + "reward": 1.06591796875, + "reward_std": 0.2908085882663727, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.94677734375, + "rewards/tag_count_reward/std": 0.16094864904880524, "step": 1921 }, { @@ -55724,27 +55724,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 861.740234375, - "completions/mean_terminated_length": 818.5162353515625, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 1865.0, + "completions/mean_length": 873.791015625, + "completions/mean_terminated_length": 864.5452880859375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, "epoch": 0.656140650337117, - "grad_norm": 2.138991594314575, - "kl": 7.296875, - "learning_rate": 3.873089302990844e-07, - "loss": 0.4315, - "num_tokens": 1053455509.0, - "reward": 1.81689453125, - "reward_std": 0.535413384437561, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.1964779943227768, + "grad_norm": 3.688720703125, + "kl": 2.40625, + "learning_rate": 3.8750008657014554e-07, + "loss": 0.1733, + "num_tokens": 1086991342.0, + "reward": 1.0263671875, + "reward_std": 0.31121253967285156, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.19258584082126617, "step": 1922 }, { @@ -55753,27 +55753,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 882.025390625, - "completions/mean_terminated_length": 817.115478515625, - "completions/min_length": 123.0, - "completions/min_terminated_length": 123.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 867.46875, + "completions/mean_terminated_length": 858.1732177734375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.6564820346505078, - "grad_norm": 3.2062578201293945, - "kl": 9.2421875, - "learning_rate": 3.868092011849425e-07, - "loss": 0.549, - "num_tokens": 1053998962.0, - "reward": 1.78662109375, - "reward_std": 0.598289430141449, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.21219956874847412, + "grad_norm": 8.26951789855957, + "kl": 2.400390625, + "learning_rate": 3.8700007950447856e-07, + "loss": 0.1594, + "num_tokens": 1087527342.0, + "reward": 1.064453125, + "reward_std": 0.34242507815361023, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.18991030752658844, "step": 1923 }, { @@ -55782,27 +55782,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1835.0, - "completions/mean_length": 733.099609375, - "completions/mean_terminated_length": 690.6834716796875, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 801.353515625, + "completions/mean_terminated_length": 791.5374145507812, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.6568234189638986, - "grad_norm": 1.269345998764038, - "kl": 6.4453125, - "learning_rate": 3.8630970369029146e-07, - "loss": 0.3937, - "num_tokens": 1054444997.0, - "reward": 1.84033203125, - "reward_std": 0.5486959218978882, - "rewards/accuracy_reward/mean": 0.05645161122083664, - "rewards/accuracy_reward/std": 0.23102475702762604, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.19724006950855255, + "grad_norm": 6.074746608734131, + "kl": 3.14453125, + "learning_rate": 3.865003039629491e-07, + "loss": 0.1474, + "num_tokens": 1088008323.0, + "reward": 1.0615234375, + "reward_std": 0.35644254088401794, + "rewards/accuracy_reward/mean": 0.0927419364452362, + "rewards/accuracy_reward/std": 0.2903633117675781, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.19473710656166077, "step": 1924 }, { @@ -55811,27 +55811,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, - "completions/mean_length": 815.125, - "completions/mean_terminated_length": 772.7838745117188, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/mean_length": 792.802734375, + "completions/mean_terminated_length": 782.9193115234375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, "epoch": 0.6571648032772894, - "grad_norm": 1.5820393562316895, - "kl": 7.1328125, - "learning_rate": 3.858104385240768e-07, - "loss": 0.441, - "num_tokens": 1054937733.0, - "reward": 1.83935546875, - "reward_std": 0.5692523717880249, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.20106884837150574, + "grad_norm": 2.151301622390747, + "kl": 2.724609375, + "learning_rate": 3.86000760655435e-07, + "loss": 0.1751, + "num_tokens": 1088489630.0, + "reward": 1.07958984375, + "reward_std": 0.3261934220790863, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.17571881413459778, "step": 1925 }, { @@ -55840,27 +55840,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 800.107421875, - "completions/mean_terminated_length": 746.7352905273438, - "completions/min_length": 104.0, - "completions/min_terminated_length": 104.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 823.259765625, + "completions/mean_terminated_length": 813.6161499023438, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.6575061875906802, - "grad_norm": 2.613544464111328, - "kl": 7.4140625, - "learning_rate": 3.85311406394914e-07, - "loss": 0.4794, - "num_tokens": 1055423292.0, - "reward": 1.7880859375, - "reward_std": 0.5436952114105225, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.20418420433998108, + "grad_norm": 8.23900032043457, + "kl": 2.796875, + "learning_rate": 3.855014502914851e-07, + "loss": 0.1273, + "num_tokens": 1088987043.0, + "reward": 1.046875, + "reward_std": 0.3548493981361389, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.1852131336927414, "step": 1926 }, { @@ -55869,27 +55869,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 793.6015625, - "completions/mean_terminated_length": 758.3373413085938, - "completions/min_length": 94.0, - "completions/min_terminated_length": 94.0, + "completions/max_terminated_length": 1905.0, + "completions/mean_length": 811.849609375, + "completions/mean_terminated_length": 799.6588134765625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, "epoch": 0.6578475719040711, - "grad_norm": 1.9729578495025635, - "kl": 7.0078125, - "learning_rate": 3.848126080110878e-07, - "loss": 0.4253, - "num_tokens": 1055903920.0, - "reward": 1.859375, - "reward_std": 0.5435269474983215, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.19074371457099915, + "grad_norm": 3.4573681354522705, + "kl": 3.140625, + "learning_rate": 3.850023735803166e-07, + "loss": 0.1972, + "num_tokens": 1089477014.0, + "reward": 1.115234375, + "reward_std": 0.3450678288936615, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.1883339285850525, "step": 1927 }, { @@ -55898,27 +55898,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1954.0, - "completions/mean_length": 808.990234375, - "completions/mean_terminated_length": 745.3860473632812, - "completions/min_length": 90.0, - "completions/min_terminated_length": 90.0, + "completions/max_terminated_length": 1840.0, + "completions/mean_length": 807.71875, + "completions/mean_terminated_length": 795.4871826171875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, "epoch": 0.6581889562174618, - "grad_norm": 1.095579743385315, - "kl": 8.6328125, - "learning_rate": 3.8431404408055133e-07, - "loss": 0.5712, - "num_tokens": 1056406987.0, - "reward": 1.791015625, - "reward_std": 0.6053373217582703, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.904296875, - "rewards/tag_count_reward/std": 0.2203473001718521, + "grad_norm": 2.5229742527008057, + "kl": 2.37890625, + "learning_rate": 3.8450353123081545e-07, + "loss": 0.1326, + "num_tokens": 1089979430.0, + "reward": 1.08740234375, + "reward_std": 0.3329201936721802, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.1757296919822693, "step": 1928 }, { @@ -55927,27 +55927,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1945.0, - "completions/mean_length": 776.359375, - "completions/mean_terminated_length": 702.7933349609375, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 808.38671875, + "completions/mean_terminated_length": 783.6932373046875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.6585303405308526, - "grad_norm": 1.4815394878387451, - "kl": 7.609375, - "learning_rate": 3.8381571531092496e-07, - "loss": 0.4877, - "num_tokens": 1056887987.0, - "reward": 1.78564453125, - "reward_std": 0.5911592245101929, - "rewards/accuracy_reward/mean": 0.08064515888690948, - "rewards/accuracy_reward/std": 0.2725643217563629, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.89892578125, - "rewards/tag_count_reward/std": 0.22154901921749115, + "grad_norm": 2.9450507164001465, + "kl": 2.7578125, + "learning_rate": 3.8400492395153417e-07, + "loss": 0.1612, + "num_tokens": 1090476828.0, + "reward": 1.08740234375, + "reward_std": 0.3415449261665344, + "rewards/accuracy_reward/mean": 0.11290322244167328, + "rewards/accuracy_reward/std": 0.3167939782142639, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.18577399849891663, "step": 1929 }, { @@ -55956,27 +55956,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1923.0, - "completions/mean_length": 831.947265625, - "completions/mean_terminated_length": 753.5737915039062, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 815.77734375, + "completions/mean_terminated_length": 803.625244140625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, "epoch": 0.6588717248442434, - "grad_norm": 2.4619951248168945, - "kl": 7.9765625, - "learning_rate": 3.8331762240949503e-07, - "loss": 0.5882, - "num_tokens": 1057392616.0, - "reward": 1.76904296875, - "reward_std": 0.5942692756652832, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.90771484375, - "rewards/tag_count_reward/std": 0.2204204648733139, + "grad_norm": 2.0206379890441895, + "kl": 2.84765625, + "learning_rate": 3.8350655245069184e-07, + "loss": 0.1696, + "num_tokens": 1090973178.0, + "reward": 1.05078125, + "reward_std": 0.3212524354457855, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.1843028962612152, "step": 1930 }, { @@ -55985,27 +55985,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 857.029296875, - "completions/mean_terminated_length": 803.55712890625, - "completions/min_length": 69.0, - "completions/min_terminated_length": 69.0, + "completions/max_terminated_length": 1914.0, + "completions/mean_length": 880.236328125, + "completions/mean_terminated_length": 875.6569213867188, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.6592131091576342, - "grad_norm": 1.31281578540802, - "kl": 6.1484375, - "learning_rate": 3.8281976608321366e-07, - "loss": 0.3631, - "num_tokens": 1057910903.0, - "reward": 1.779296875, - "reward_std": 0.5633202791213989, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.19666332006454468, + "grad_norm": 7.919393539428711, + "kl": 3.0, + "learning_rate": 3.8300841743617227e-07, + "loss": 0.1117, + "num_tokens": 1091503347.0, + "reward": 1.05419921875, + "reward_std": 0.3303118944168091, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.17361809313297272, "step": 1931 }, { @@ -56014,27 +56014,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1971.0, - "completions/mean_length": 718.65625, - "completions/mean_terminated_length": 684.0240478515625, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 762.11328125, + "completions/mean_terminated_length": 751.9881591796875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, "epoch": 0.659554493471025, - "grad_norm": 0.8023258447647095, - "kl": 4.671875, - "learning_rate": 3.823221470386965e-07, - "loss": 0.2573, - "num_tokens": 1058351991.0, - "reward": 1.8837890625, - "reward_std": 0.5155116319656372, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.18835169076919556, + "grad_norm": 6.266638278961182, + "kl": 3.234375, + "learning_rate": 3.8251051961552373e-07, + "loss": 0.1465, + "num_tokens": 1091966685.0, + "reward": 1.126953125, + "reward_std": 0.352092981338501, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.1748199313879013, "step": 1932 }, { @@ -56043,27 +56043,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1982.0, - "completions/mean_length": 796.8125, - "completions/mean_terminated_length": 732.5831909179688, - "completions/min_length": 68.0, - "completions/min_terminated_length": 68.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 814.884765625, + "completions/mean_terminated_length": 800.2628784179688, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.6598958777844158, - "grad_norm": 1.6733766794204712, - "kl": 6.53515625, - "learning_rate": 3.81824765982223e-07, - "loss": 0.4237, - "num_tokens": 1058835591.0, - "reward": 1.78076171875, - "reward_std": 0.5668002367019653, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.91162109375, - "rewards/tag_count_reward/std": 0.2089642435312271, + "grad_norm": 10.530084609985352, + "kl": 3.9453125, + "learning_rate": 3.8201285969595696e-07, + "loss": 0.1963, + "num_tokens": 1092459538.0, + "reward": 1.05810546875, + "reward_std": 0.33577898144721985, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.16935545206069946, "step": 1933 }, { @@ -56072,27 +56072,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 786.013671875, - "completions/mean_terminated_length": 750.5361328125, - "completions/min_length": 81.0, - "completions/min_terminated_length": 81.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 822.65625, + "completions/mean_terminated_length": 810.572021484375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.6602372620978066, - "grad_norm": 4.714256763458252, - "kl": 5.07421875, - "learning_rate": 3.8132762361973456e-07, - "loss": 0.3631, - "num_tokens": 1059313134.0, - "reward": 1.87939453125, - "reward_std": 0.5234352350234985, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.177461177110672, + "grad_norm": 6.426862716674805, + "kl": 2.96875, + "learning_rate": 3.8151543838434566e-07, + "loss": 0.0984, + "num_tokens": 1092955842.0, + "reward": 1.1064453125, + "reward_std": 0.37291496992111206, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.07421875, + "rewards/format_reward/std": 0.2623828947544098, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.17380855977535248, "step": 1934 }, { @@ -56101,27 +56101,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 828.1015625, - "completions/mean_terminated_length": 773.33056640625, - "completions/min_length": 53.0, - "completions/min_terminated_length": 53.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 822.7109375, + "completions/mean_terminated_length": 810.6272583007812, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, "epoch": 0.6605786464111975, - "grad_norm": 1.5419610738754272, - "kl": 6.0703125, - "learning_rate": 3.8083072065683373e-07, - "loss": 0.3605, - "num_tokens": 1059813762.0, - "reward": 1.76513671875, - "reward_std": 0.558008074760437, - "rewards/accuracy_reward/mean": 0.04032257944345474, - "rewards/accuracy_reward/std": 0.19691328704357147, - "rewards/format_reward/mean": 0.8125, - "rewards/format_reward/std": 0.39069411158561707, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.20506387948989868, + "grad_norm": 5.250598907470703, + "kl": 3.15234375, + "learning_rate": 3.8101825638722395e-07, + "loss": 0.1432, + "num_tokens": 1093453710.0, + "reward": 1.0634765625, + "reward_std": 0.36420419812202454, + "rewards/accuracy_reward/mean": 0.07661290466785431, + "rewards/accuracy_reward/std": 0.2662447690963745, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.9326171875, + "rewards/tag_count_reward/std": 0.17996680736541748, "step": 1935 }, { @@ -56130,27 +56130,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 774.724609375, - "completions/mean_terminated_length": 717.55712890625, - "completions/min_length": 77.0, - "completions/min_terminated_length": 77.0, + "completions/max_terminated_length": 1861.0, + "completions/mean_length": 802.263671875, + "completions/mean_terminated_length": 799.8258056640625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.6609200307245882, - "grad_norm": 1.3766438961029053, - "kl": 5.578125, - "learning_rate": 3.803340577987834e-07, - "loss": 0.3549, - "num_tokens": 1060287365.0, - "reward": 1.87255859375, - "reward_std": 0.5209040641784668, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.1974821388721466, + "grad_norm": 5.057370662689209, + "kl": 2.623046875, + "learning_rate": 3.805213144107865e-07, + "loss": 0.1308, + "num_tokens": 1093941413.0, + "reward": 1.07275390625, + "reward_std": 0.32390761375427246, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.94189453125, + "rewards/tag_count_reward/std": 0.16453613340854645, "step": 1936 }, { @@ -56159,27 +56159,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1939.0, - "completions/mean_length": 831.072265625, - "completions/mean_terminated_length": 768.6016845703125, - "completions/min_length": 75.0, - "completions/min_terminated_length": 75.0, + "completions/max_terminated_length": 1968.0, + "completions/mean_length": 837.005859375, + "completions/mean_terminated_length": 834.635986328125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.661261415037979, - "grad_norm": 1.6967791318893433, - "kl": 6.421875, - "learning_rate": 3.7983763575050575e-07, - "loss": 0.4519, - "num_tokens": 1060789130.0, - "reward": 1.853515625, - "reward_std": 0.5324754118919373, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.931640625, - "rewards/tag_count_reward/std": 0.186283141374588, + "grad_norm": 3.881486415863037, + "kl": 2.0146484375, + "learning_rate": 3.800246131608863e-07, + "loss": 0.1176, + "num_tokens": 1094446216.0, + "reward": 1.09130859375, + "reward_std": 0.30351150035858154, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.95068359375, + "rewards/tag_count_reward/std": 0.15604117512702942, "step": 1937 }, { @@ -56188,27 +56188,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1930.0, - "completions/mean_length": 832.05078125, - "completions/mean_terminated_length": 767.0, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 845.869140625, + "completions/mean_terminated_length": 829.2059936523438, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.6616027993513698, - "grad_norm": 1.568750262260437, - "kl": 8.0390625, - "learning_rate": 3.79341455216581e-07, - "loss": 0.5526, - "num_tokens": 1061298772.0, - "reward": 1.83056640625, - "reward_std": 0.579119861125946, - "rewards/accuracy_reward/mean": 0.07258064299821854, - "rewards/accuracy_reward/std": 0.25970885157585144, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.20823590457439423, + "grad_norm": 3.580059051513672, + "kl": 3.234375, + "learning_rate": 3.7952815334303535e-07, + "loss": 0.1612, + "num_tokens": 1094962933.0, + "reward": 1.07421875, + "reward_std": 0.39027005434036255, + "rewards/accuracy_reward/mean": 0.10483870655298233, + "rewards/accuracy_reward/std": 0.30665475130081177, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.919921875, + "rewards/tag_count_reward/std": 0.20013105869293213, "step": 1938 }, { @@ -56217,27 +56217,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 828.86328125, - "completions/mean_terminated_length": 792.068359375, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 857.09765625, + "completions/mean_terminated_length": 838.1945190429688, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.6619441836647606, - "grad_norm": 3.181682825088501, - "kl": 8.484375, - "learning_rate": 3.788455169012469e-07, - "loss": 0.4863, - "num_tokens": 1061797150.0, - "reward": 1.7841796875, - "reward_std": 0.5454611778259277, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.19414739310741425, + "grad_norm": 7.356101036071777, + "kl": 2.70703125, + "learning_rate": 3.790319356624021e-07, + "loss": 0.205, + "num_tokens": 1095475767.0, + "reward": 1.0458984375, + "reward_std": 0.29537346959114075, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.17380855977535248, "step": 1939 }, { @@ -56246,27 +56246,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 754.0625, - "completions/mean_terminated_length": 725.6527099609375, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/max_terminated_length": 1761.0, + "completions/mean_length": 800.19921875, + "completions/mean_terminated_length": 792.8448486328125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.6622855679781514, - "grad_norm": 0.8705515265464783, - "kl": 6.1875, - "learning_rate": 3.783498215083967e-07, - "loss": 0.3681, - "num_tokens": 1062253006.0, - "reward": 1.875, - "reward_std": 0.4800451993942261, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.94140625, - "rewards/tag_count_reward/std": 0.1816815733909607, + "grad_norm": 5.001293182373047, + "kl": 1.302734375, + "learning_rate": 3.7853596082381134e-07, + "loss": 0.0177, + "num_tokens": 1095955245.0, + "reward": 1.06396484375, + "reward_std": 0.28220146894454956, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.95654296875, + "rewards/tag_count_reward/std": 0.1396850049495697, "step": 1940 }, { @@ -56275,27 +56275,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1983.0, - "completions/mean_length": 795.724609375, - "completions/mean_terminated_length": 768.2295532226562, - "completions/min_length": 183.0, - "completions/min_terminated_length": 183.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 873.361328125, + "completions/mean_terminated_length": 866.4381713867188, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.6626269522915422, - "grad_norm": 1.591826319694519, - "kl": 6.5, - "learning_rate": 3.778543697415797e-07, - "loss": 0.3956, - "num_tokens": 1062733809.0, - "reward": 1.82763671875, - "reward_std": 0.5358124375343323, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.19686181843280792, + "grad_norm": 1.962906002998352, + "kl": 1.568359375, + "learning_rate": 3.780402295317426e-07, + "loss": 0.0793, + "num_tokens": 1096475798.0, + "reward": 1.1083984375, + "reward_std": 0.37327125668525696, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.9404296875, + "rewards/tag_count_reward/std": 0.17094823718070984, "step": 1941 }, { @@ -56304,27 +56304,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 760.849609375, - "completions/mean_terminated_length": 724.6646118164062, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 839.306640625, + "completions/mean_terminated_length": 829.7893676757812, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, "epoch": 0.662968336604933, - "grad_norm": 1.7542084455490112, - "kl": 6.140625, - "learning_rate": 3.77359162303999e-07, - "loss": 0.3728, - "num_tokens": 1063192404.0, - "reward": 1.91455078125, - "reward_std": 0.47773921489715576, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.1739315241575241, + "grad_norm": 2.693998098373413, + "kl": 2.765625, + "learning_rate": 3.775447424903302e-07, + "loss": 0.1718, + "num_tokens": 1096974563.0, + "reward": 1.1181640625, + "reward_std": 0.3343203663825989, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.1847042590379715, "step": 1942 }, { @@ -56333,27 +56333,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 811.21484375, - "completions/mean_terminated_length": 778.9940185546875, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 865.41015625, + "completions/mean_terminated_length": 853.74755859375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, "epoch": 0.6633097209183239, - "grad_norm": 1.9097706079483032, - "kl": 5.69921875, - "learning_rate": 3.7686419989851104e-07, - "loss": 0.3206, - "num_tokens": 1063690578.0, - "reward": 1.90087890625, - "reward_std": 0.5013086795806885, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.18353331089019775, + "grad_norm": 8.785478591918945, + "kl": 2.1640625, + "learning_rate": 3.770495004033606e-07, + "loss": 0.1616, + "num_tokens": 1097500485.0, + "reward": 1.06640625, + "reward_std": 0.274461030960083, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.021484375, + "rewards/format_reward/std": 0.14513419568538666, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.17754259705543518, "step": 1943 }, { @@ -56362,27 +56362,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 792.306640625, - "completions/mean_terminated_length": 751.8003540039062, - "completions/min_length": 101.0, - "completions/min_terminated_length": 101.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 861.705078125, + "completions/mean_terminated_length": 847.6383666992188, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, "epoch": 0.6636511052317146, - "grad_norm": 2.448972463607788, - "kl": 7.4765625, - "learning_rate": 3.763694832276242e-07, - "loss": 0.4391, - "num_tokens": 1064166671.0, - "reward": 1.8828125, - "reward_std": 0.5133916735649109, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.18677493929862976, + "grad_norm": 3.886937141418457, + "kl": 2.111328125, + "learning_rate": 3.76554503974273e-07, + "loss": 0.131, + "num_tokens": 1098012110.0, + "reward": 1.14306640625, + "reward_std": 0.3767285943031311, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.18275068700313568, "step": 1944 }, { @@ -56391,27 +56391,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 770.896484375, - "completions/mean_terminated_length": 740.2460327148438, - "completions/min_length": 138.0, - "completions/min_terminated_length": 138.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 848.392578125, + "completions/mean_terminated_length": 831.764404296875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, "epoch": 0.6639924895451054, - "grad_norm": 1.7370452880859375, - "kl": 7.4453125, - "learning_rate": 3.758750129934988e-07, - "loss": 0.4535, - "num_tokens": 1064637146.0, - "reward": 1.806640625, - "reward_std": 0.5249841213226318, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.18677493929862976, + "grad_norm": 1.5926653146743774, + "kl": 2.45703125, + "learning_rate": 3.7605975390615717e-07, + "loss": 0.1366, + "num_tokens": 1098522263.0, + "reward": 1.03955078125, + "reward_std": 0.3317958116531372, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.18672312796115875, "step": 1945 }, { @@ -56420,27 +56420,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1971.0, - "completions/mean_length": 795.15625, - "completions/mean_terminated_length": 754.741943359375, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 866.37109375, + "completions/mean_terminated_length": 838.0120239257812, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.6643338738584962, - "grad_norm": 2.202965497970581, - "kl": 7.3125, - "learning_rate": 3.753807898979442e-07, - "loss": 0.437, - "num_tokens": 1065125418.0, - "reward": 1.7822265625, - "reward_std": 0.5233513116836548, - "rewards/accuracy_reward/mean": 0.02734375, - "rewards/accuracy_reward/std": 0.16324250400066376, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.20379072427749634, + "grad_norm": 3.5171382427215576, + "kl": 3.296875, + "learning_rate": 3.755652509017536e-07, + "loss": 0.2187, + "num_tokens": 1099046997.0, + "reward": 1.0009765625, + "reward_std": 0.3274975121021271, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.21218886971473694, "step": 1946 }, { @@ -56449,27 +56449,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1990.0, - "completions/mean_length": 794.1875, - "completions/mean_terminated_length": 758.9397583007812, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 815.806640625, + "completions/mean_terminated_length": 798.7267456054688, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, "epoch": 0.664675258171887, - "grad_norm": 1.1559278964996338, - "kl": 6.0390625, - "learning_rate": 3.748868146424201e-07, - "loss": 0.3858, - "num_tokens": 1065612154.0, - "reward": 1.80908203125, - "reward_std": 0.5231696963310242, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.18341873586177826, + "grad_norm": 7.1579179763793945, + "kl": 2.640625, + "learning_rate": 3.7507099566345125e-07, + "loss": 0.2068, + "num_tokens": 1099544802.0, + "reward": 1.03076171875, + "reward_std": 0.29526475071907043, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.16935545206069946, "step": 1947 }, { @@ -56478,27 +56478,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 806.630859375, - "completions/mean_terminated_length": 763.9979858398438, - "completions/min_length": 218.0, - "completions/min_terminated_length": 218.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 853.416015625, + "completions/mean_terminated_length": 846.3753051757812, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, "epoch": 0.6650166424852778, - "grad_norm": 1.417567491531372, - "kl": 5.375, - "learning_rate": 3.7439308792803405e-07, - "loss": 0.3436, - "num_tokens": 1066102669.0, - "reward": 1.86669921875, - "reward_std": 0.56736159324646, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.19079817831516266, + "grad_norm": 1.816637396812439, + "kl": 2.94140625, + "learning_rate": 3.745769888932876e-07, + "loss": 0.168, + "num_tokens": 1100059271.0, + "reward": 1.11376953125, + "reward_std": 0.37326788902282715, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.18652856349945068, "step": 1948 }, { @@ -56507,27 +56507,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1984.0, - "completions/mean_length": 790.591796875, - "completions/mean_terminated_length": 747.4081420898438, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 1889.0, + "completions/mean_length": 784.5625, + "completions/mean_terminated_length": 779.60791015625, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.6653580267986686, - "grad_norm": 1.6329373121261597, - "kl": 6.6953125, - "learning_rate": 3.738996104555406e-07, - "loss": 0.4305, - "num_tokens": 1066585612.0, - "reward": 1.861328125, - "reward_std": 0.591179370880127, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.20013105869293213, + "grad_norm": 2.8198273181915283, + "kl": 2.626953125, + "learning_rate": 3.740832312929465e-07, + "loss": 0.1489, + "num_tokens": 1100539127.0, + "reward": 1.12255859375, + "reward_std": 0.3355463743209839, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.18254666030406952, "step": 1949 }, { @@ -56536,27 +56536,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 863.728515625, - "completions/mean_terminated_length": 813.0774536132812, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 850.197265625, + "completions/mean_terminated_length": 835.9940795898438, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, "epoch": 0.6656994111120594, - "grad_norm": 2.3752756118774414, - "kl": 6.1328125, - "learning_rate": 3.734063829253411e-07, - "loss": 0.4053, - "num_tokens": 1067110257.0, - "reward": 1.8310546875, - "reward_std": 0.5816721320152283, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.20617742836475372, + "grad_norm": 3.4084203243255615, + "kl": 2.953125, + "learning_rate": 3.73589723563759e-07, + "loss": 0.1766, + "num_tokens": 1101056844.0, + "reward": 1.0390625, + "reward_std": 0.2964341342449188, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.17551816999912262, "step": 1950 }, { @@ -56565,27 +56565,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 811.49609375, - "completions/mean_terminated_length": 755.9795532226562, - "completions/min_length": 31.0, - "completions/min_terminated_length": 31.0, - "epoch": 0.6660407954254502, - "grad_norm": 0.8571250438690186, - "kl": 7.1328125, - "learning_rate": 3.7291340603748146e-07, - "loss": 0.452, - "num_tokens": 1067602031.0, - "reward": 1.80322265625, - "reward_std": 0.5911970138549805, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.19981031119823456, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 867.65625, + "completions/mean_terminated_length": 851.2951049804688, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.6660407954254502, + "grad_norm": 6.745882987976074, + "kl": 4.06640625, + "learning_rate": 3.7309646640670003e-07, + "loss": 0.2372, + "num_tokens": 1101577372.0, + "reward": 1.09326171875, + "reward_std": 0.39704084396362305, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.07421875, + "rewards/format_reward/std": 0.2623828947544098, + "rewards/tag_count_reward/mean": 0.91357421875, + "rewards/tag_count_reward/std": 0.19901005923748016, "step": 1951 }, { @@ -56594,27 +56594,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1949.0, - "completions/mean_length": 732.78515625, - "completions/mean_terminated_length": 690.3588256835938, - "completions/min_length": 89.0, - "completions/min_terminated_length": 89.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 754.130859375, + "completions/mean_terminated_length": 751.5988159179688, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.666382179738841, - "grad_norm": 1.4395986795425415, - "kl": 5.8984375, - "learning_rate": 3.724206804916526e-07, - "loss": 0.3438, - "num_tokens": 1068055121.0, - "reward": 1.83740234375, - "reward_std": 0.5787259340286255, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.19697342813014984, + "grad_norm": 5.3336381912231445, + "kl": 2.7890625, + "learning_rate": 3.7260346052238967e-07, + "loss": 0.1359, + "num_tokens": 1102041391.0, + "reward": 1.11474609375, + "reward_std": 0.34246891736984253, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.94091796875, + "rewards/tag_count_reward/std": 0.16640710830688477, "step": 1952 }, { @@ -56623,27 +56623,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1946.0, - "completions/mean_length": 736.90234375, - "completions/mean_terminated_length": 708.1157836914062, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 1832.0, + "completions/mean_length": 737.162109375, + "completions/mean_terminated_length": 734.5968627929688, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, "epoch": 0.6667235640522318, - "grad_norm": 1.1494282484054565, - "kl": 6.9609375, - "learning_rate": 3.7192820698718797e-07, - "loss": 0.4307, - "num_tokens": 1068510511.0, - "reward": 1.810546875, - "reward_std": 0.5527421236038208, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.196245014667511, + "grad_norm": 2.959735870361328, + "kl": 2.76171875, + "learning_rate": 3.721107066110901e-07, + "loss": 0.1418, + "num_tokens": 1102496914.0, + "reward": 1.05615234375, + "reward_std": 0.34262150526046753, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.92919921875, + "rewards/tag_count_reward/std": 0.18503700196743011, "step": 1953 }, { @@ -56652,27 +56652,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1989.0, - "completions/mean_length": 801.71484375, - "completions/mean_terminated_length": 743.0960693359375, - "completions/min_length": 94.0, - "completions/min_terminated_length": 94.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 812.033203125, + "completions/mean_terminated_length": 787.412353515625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, "epoch": 0.6670649483656226, - "grad_norm": 1.0540169477462769, - "kl": 6.69140625, - "learning_rate": 3.7143598622306374e-07, - "loss": 0.4275, - "num_tokens": 1069007277.0, - "reward": 1.79931640625, - "reward_std": 0.5230346918106079, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.2022770643234253, + "grad_norm": 5.216389179229736, + "kl": 4.13671875, + "learning_rate": 3.716182053727067e-07, + "loss": 0.2641, + "num_tokens": 1102998963.0, + "reward": 1.01025390625, + "reward_std": 0.3005276024341583, + "rewards/accuracy_reward/mean": 0.03515625, + "rewards/accuracy_reward/std": 0.1843547374010086, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.92822265625, + "rewards/tag_count_reward/std": 0.18532080948352814, "step": 1954 }, { @@ -56681,27 +56681,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1999.0, - "completions/mean_length": 863.84765625, - "completions/mean_terminated_length": 800.4979248046875, - "completions/min_length": 8.0, - "completions/min_terminated_length": 8.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 884.01953125, + "completions/mean_terminated_length": 874.8543090820312, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, "epoch": 0.6674063326790134, - "grad_norm": 2.0837478637695312, - "kl": 8.3515625, - "learning_rate": 3.7094401889789715e-07, - "loss": 0.4981, - "num_tokens": 1069527295.0, - "reward": 1.7197265625, - "reward_std": 0.63157057762146, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.794921875, - "rewards/format_reward/std": 0.4041535556316376, - "rewards/tag_count_reward/mean": 0.8876953125, - "rewards/tag_count_reward/std": 0.23425979912281036, + "grad_norm": 5.178909778594971, + "kl": 3.453125, + "learning_rate": 3.7112595750678486e-07, + "loss": 0.1618, + "num_tokens": 1103529309.0, + "reward": 1.04541015625, + "reward_std": 0.3825531601905823, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.19279102981090546, "step": 1955 }, { @@ -56710,27 +56710,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1978.0, - "completions/mean_length": 807.505859375, - "completions/mean_terminated_length": 777.7340087890625, - "completions/min_length": 65.0, - "completions/min_terminated_length": 65.0, + "completions/max_terminated_length": 1922.0, + "completions/mean_length": 819.318359375, + "completions/mean_terminated_length": 809.6436767578125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.6677477169924042, - "grad_norm": 1.1107139587402344, - "kl": 6.5859375, - "learning_rate": 3.70452305709946e-07, - "loss": 0.3762, - "num_tokens": 1070018066.0, - "reward": 1.7998046875, - "reward_std": 0.5736995935440063, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, - "rewards/tag_count_reward/mean": 0.9130859375, - "rewards/tag_count_reward/std": 0.2075263112783432, + "grad_norm": 2.1413521766662598, + "kl": 2.21875, + "learning_rate": 3.706339637125109e-07, + "loss": 0.1239, + "num_tokens": 1104026128.0, + "reward": 1.09912109375, + "reward_std": 0.3082974851131439, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.94677734375, + "rewards/tag_count_reward/std": 0.15074624121189117, "step": 1956 }, { @@ -56739,27 +56739,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 812.60546875, - "completions/mean_terminated_length": 767.5911254882812, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 871.69140625, + "completions/mean_terminated_length": 860.0907592773438, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, "epoch": 0.668089101305795, - "grad_norm": 1.0080444812774658, - "kl": 7.5703125, - "learning_rate": 3.699608473571072e-07, - "loss": 0.5046, - "num_tokens": 1070511496.0, - "reward": 1.7890625, - "reward_std": 0.541830837726593, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.2087314873933792, + "grad_norm": 4.940948963165283, + "kl": 2.75, + "learning_rate": 3.701422246887096e-07, + "loss": 0.1397, + "num_tokens": 1104549810.0, + "reward": 1.0263671875, + "reward_std": 0.3215748071670532, + "rewards/accuracy_reward/mean": 0.044921875, + "rewards/accuracy_reward/std": 0.20733514428138733, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.1874362826347351, "step": 1957 }, { @@ -56768,27 +56768,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1912.0, - "completions/mean_length": 826.48828125, - "completions/mean_terminated_length": 779.4117431640625, - "completions/min_length": 117.0, - "completions/min_terminated_length": 117.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 848.365234375, + "completions/mean_terminated_length": 838.9193115234375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.6684304856191858, - "grad_norm": 0.8483101725578308, - "kl": 6.21875, - "learning_rate": 3.694696445369161e-07, - "loss": 0.3647, - "num_tokens": 1071007442.0, - "reward": 1.8505859375, - "reward_std": 0.5570029020309448, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.19773660600185394, + "grad_norm": 1.736469030380249, + "kl": 2.158203125, + "learning_rate": 3.6965074113384467e-07, + "loss": 0.1219, + "num_tokens": 1105056957.0, + "reward": 1.09814453125, + "reward_std": 0.3518895208835602, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.18965290486812592, "step": 1958 }, { @@ -56797,27 +56797,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1886.0, - "completions/mean_length": 801.98828125, - "completions/mean_terminated_length": 753.967529296875, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 1827.0, + "completions/mean_length": 835.8984375, + "completions/mean_terminated_length": 823.9447631835938, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.6687718699325766, - "grad_norm": 2.572129249572754, - "kl": 5.078125, - "learning_rate": 3.68978697946545e-07, - "loss": 0.3551, - "num_tokens": 1071490860.0, - "reward": 1.8896484375, - "reward_std": 0.4518565237522125, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.89453125, - "rewards/format_reward/std": 0.3074568510055542, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.18270690739154816, + "grad_norm": 3.6304750442504883, + "kl": 1.994140625, + "learning_rate": 3.6915951374601584e-07, + "loss": 0.1148, + "num_tokens": 1105557737.0, + "reward": 1.0693359375, + "reward_std": 0.2911996841430664, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.9462890625, + "rewards/tag_count_reward/std": 0.15964092314243317, "step": 1959 }, { @@ -56826,27 +56826,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 810.318359375, - "completions/mean_terminated_length": 783.1437377929688, - "completions/min_length": 70.0, - "completions/min_terminated_length": 70.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 852.318359375, + "completions/mean_terminated_length": 838.1403198242188, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, "epoch": 0.6691132542459673, - "grad_norm": 1.489697813987732, - "kl": 5.30859375, - "learning_rate": 3.6848800828280303e-07, - "loss": 0.3124, - "num_tokens": 1071993823.0, - "reward": 1.83935546875, - "reward_std": 0.5098040699958801, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.19512130320072174, + "grad_norm": 3.80436635017395, + "kl": 2.37109375, + "learning_rate": 3.686685432229604e-07, + "loss": 0.1437, + "num_tokens": 1106082204.0, + "reward": 1.0546875, + "reward_std": 0.30142658948898315, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.1748199313879013, "step": 1960 }, { @@ -56855,27 +56855,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1879.0, - "completions/mean_length": 810.70703125, - "completions/mean_terminated_length": 763.0222778320312, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 794.326171875, + "completions/mean_terminated_length": 791.872802734375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, "epoch": 0.6694546385593582, - "grad_norm": 1.9213225841522217, - "kl": 6.4765625, - "learning_rate": 3.67997576242134e-07, - "loss": 0.4127, - "num_tokens": 1072488105.0, - "reward": 1.84521484375, - "reward_std": 0.5162262320518494, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.20101657509803772, + "grad_norm": 1.9554775953292847, + "kl": 1.9921875, + "learning_rate": 3.681778302620494e-07, + "loss": 0.1007, + "num_tokens": 1106568099.0, + "reward": 1.091796875, + "reward_std": 0.34669455885887146, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.939453125, + "rewards/tag_count_reward/std": 0.16402500867843628, "step": 1961 }, { @@ -56884,27 +56884,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1895.0, - "completions/mean_length": 756.341796875, - "completions/mean_terminated_length": 722.69140625, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 803.2890625, + "completions/mean_terminated_length": 788.5296630859375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, "epoch": 0.669796022872749, - "grad_norm": 0.9990555047988892, - "kl": 5.2890625, - "learning_rate": 3.67507402520617e-07, - "loss": 0.3254, - "num_tokens": 1072949272.0, - "reward": 1.962890625, - "reward_std": 0.476193904876709, - "rewards/accuracy_reward/mean": 0.1411290317773819, - "rewards/accuracy_reward/std": 0.3485061228275299, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.94140625, - "rewards/tag_count_reward/std": 0.17759640514850616, + "grad_norm": 3.738330125808716, + "kl": 1.88671875, + "learning_rate": 3.6768737556028904e-07, + "loss": 0.1311, + "num_tokens": 1107053303.0, + "reward": 1.18701171875, + "reward_std": 0.3353784680366516, + "rewards/accuracy_reward/mean": 0.19556452333927155, + "rewards/accuracy_reward/std": 0.3970351219177246, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.94287109375, + "rewards/tag_count_reward/std": 0.16112665832042694, "step": 1962 }, { @@ -56913,27 +56913,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 758.56640625, - "completions/mean_terminated_length": 732.8804931640625, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 1921.0, + "completions/mean_length": 815.658203125, + "completions/mean_terminated_length": 805.9547119140625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, "epoch": 0.6701374071861398, - "grad_norm": 0.6265716552734375, - "kl": 4.88671875, - "learning_rate": 3.6701748781396367e-07, - "loss": 0.3077, - "num_tokens": 1073412394.0, - "reward": 1.927734375, - "reward_std": 0.4050692915916443, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.94921875, - "rewards/tag_count_reward/std": 0.1628674566745758, + "grad_norm": 3.9076175689697266, + "kl": 2.19140625, + "learning_rate": 3.671971798143181e-07, + "loss": 0.1074, + "num_tokens": 1107545656.0, + "reward": 1.08642578125, + "reward_std": 0.3265295922756195, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.94189453125, + "rewards/tag_count_reward/std": 0.16601619124412537, "step": 1963 }, { @@ -56942,27 +56942,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1857.0, - "completions/mean_length": 819.044921875, - "completions/mean_terminated_length": 753.29833984375, - "completions/min_length": 153.0, - "completions/min_terminated_length": 153.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 801.177734375, + "completions/mean_terminated_length": 793.8291015625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, "epoch": 0.6704787914995306, - "grad_norm": 4.422807216644287, - "kl": 9.9765625, - "learning_rate": 3.6652783281751873e-07, - "loss": 0.576, - "num_tokens": 1073909841.0, - "reward": 1.76220703125, - "reward_std": 0.6102453470230103, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.90283203125, - "rewards/tag_count_reward/std": 0.2210913747549057, + "grad_norm": 3.4179844856262207, + "kl": 1.978515625, + "learning_rate": 3.6670724372040796e-07, + "loss": 0.1082, + "num_tokens": 1108033955.0, + "reward": 1.09619140625, + "reward_std": 0.3545726239681244, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.08203125, + "rewards/format_reward/std": 0.2746807038784027, + "rewards/tag_count_reward/mean": 0.94580078125, + "rewards/tag_count_reward/std": 0.15985849499702454, "step": 1964 }, { @@ -56971,27 +56971,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1990.0, - "completions/mean_length": 798.322265625, - "completions/mean_terminated_length": 750.1602172851562, - "completions/min_length": 117.0, - "completions/min_terminated_length": 117.0, + "completions/max_terminated_length": 1890.0, + "completions/mean_length": 817.5, + "completions/mean_terminated_length": 795.4830932617188, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.6708201758129214, - "grad_norm": 3.0986688137054443, - "kl": 8.03125, - "learning_rate": 3.6603843822625734e-07, - "loss": 0.4542, - "num_tokens": 1074403590.0, - "reward": 1.85205078125, - "reward_std": 0.6030385494232178, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.20666059851646423, + "grad_norm": 4.996206760406494, + "kl": 2.814453125, + "learning_rate": 3.6621756797446066e-07, + "loss": 0.1313, + "num_tokens": 1108537523.0, + "reward": 1.13134765625, + "reward_std": 0.32183706760406494, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.94189453125, + "rewards/tag_count_reward/std": 0.1600138396024704, "step": 1965 }, { @@ -57000,27 +57000,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1956.0, - "completions/mean_length": 854.54296875, - "completions/mean_terminated_length": 811.0567016601562, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 873.158203125, + "completions/mean_terminated_length": 844.9620361328125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.6711615601263122, - "grad_norm": 2.9957141876220703, - "kl": 7.7109375, - "learning_rate": 3.6554930473478595e-07, - "loss": 0.4545, - "num_tokens": 1074912140.0, - "reward": 1.8427734375, - "reward_std": 0.530123233795166, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.19335831701755524, + "grad_norm": 3.2720115184783936, + "kl": 2.91015625, + "learning_rate": 3.6572815327200933e-07, + "loss": 0.1371, + "num_tokens": 1109055604.0, + "reward": 1.087890625, + "reward_std": 0.3635501265525818, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.17805851995944977, "step": 1966 }, { @@ -57029,27 +57029,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 828.880859375, - "completions/mean_terminated_length": 802.11376953125, - "completions/min_length": 95.0, - "completions/min_terminated_length": 95.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 838.10546875, + "completions/mean_terminated_length": 835.73779296875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.671502944439703, - "grad_norm": 0.9075517058372498, - "kl": 5.890625, - "learning_rate": 3.650604330373398e-07, - "loss": 0.3298, - "num_tokens": 1075415743.0, - "reward": 1.88134765625, - "reward_std": 0.515263557434082, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.17812223732471466, + "grad_norm": 3.358809471130371, + "kl": 1.7958984375, + "learning_rate": 3.652390003082151e-07, + "loss": 0.0579, + "num_tokens": 1109563930.0, + "reward": 1.1650390625, + "reward_std": 0.3534090518951416, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.9482421875, + "rewards/tag_count_reward/std": 0.14674162864685059, "step": 1967 }, { @@ -57058,27 +57058,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 820.759765625, - "completions/mean_terminated_length": 781.1713256835938, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 815.240234375, + "completions/mean_terminated_length": 812.8277587890625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.6718443287530937, - "grad_norm": 0.6880972981452942, - "kl": 5.3203125, - "learning_rate": 3.6457182382778315e-07, - "loss": 0.3174, - "num_tokens": 1075910580.0, - "reward": 1.8984375, - "reward_std": 0.49258172512054443, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.9375, - "rewards/tag_count_reward/std": 0.18104934692382812, + "grad_norm": 2.223949432373047, + "kl": 2.072265625, + "learning_rate": 3.647501097778685e-07, + "loss": 0.1013, + "num_tokens": 1110055941.0, + "reward": 1.15283203125, + "reward_std": 0.32636475563049316, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.94970703125, + "rewards/tag_count_reward/std": 0.14931321144104004, "step": 1968 }, { @@ -57087,27 +57087,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 845.310546875, - "completions/mean_terminated_length": 811.5, - "completions/min_length": 168.0, - "completions/min_terminated_length": 168.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 893.091796875, + "completions/mean_terminated_length": 883.998046875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.6721857130664846, - "grad_norm": 2.171827793121338, - "kl": 5.7890625, - "learning_rate": 3.6408347779960734e-07, - "loss": 0.3805, - "num_tokens": 1076421923.0, - "reward": 1.8515625, - "reward_std": 0.5119212865829468, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.18894177675247192, + "grad_norm": 6.9371747970581055, + "kl": 2.205078125, + "learning_rate": 3.6426148237538656e-07, + "loss": 0.0851, + "num_tokens": 1110591748.0, + "reward": 1.07763671875, + "reward_std": 0.35847485065460205, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.94091796875, + "rewards/tag_count_reward/std": 0.16859769821166992, "step": 1969 }, { @@ -57116,27 +57116,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1944.0, - "completions/mean_length": 853.38671875, - "completions/mean_terminated_length": 802.2933349609375, - "completions/min_length": 173.0, - "completions/min_terminated_length": 173.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 860.275390625, + "completions/mean_terminated_length": 843.8118896484375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.6725270973798754, - "grad_norm": 0.7126238346099854, - "kl": 6.28125, - "learning_rate": 3.6359539564593036e-07, - "loss": 0.3868, - "num_tokens": 1076944089.0, - "reward": 1.8876953125, - "reward_std": 0.4900527000427246, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.19244687259197235, + "grad_norm": 2.914273977279663, + "kl": 2.20703125, + "learning_rate": 3.6377311879481296e-07, + "loss": 0.0944, + "num_tokens": 1111117441.0, + "reward": 1.08837890625, + "reward_std": 0.36513030529022217, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.1808004528284073, "step": 1970 }, { @@ -57145,27 +57145,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1983.0, - "completions/mean_length": 827.697265625, - "completions/mean_terminated_length": 790.8671875, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 786.12109375, + "completions/mean_terminated_length": 773.676513671875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.6728684816932662, - "grad_norm": 1.5118255615234375, - "kl": 4.75, - "learning_rate": 3.631075780594952e-07, - "loss": 0.3059, - "num_tokens": 1077437662.0, - "reward": 1.88525390625, - "reward_std": 0.4044254422187805, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.900390625, - "rewards/format_reward/std": 0.29977133870124817, - "rewards/tag_count_reward/mean": 0.94775390625, - "rewards/tag_count_reward/std": 0.16795603930950165, + "grad_norm": 2.6593613624572754, + "kl": 1.8046875, + "learning_rate": 3.632850197298161e-07, + "loss": 0.0743, + "num_tokens": 1111589727.0, + "reward": 1.06103515625, + "reward_std": 0.3264227509498596, + "rewards/accuracy_reward/mean": 0.044921875, + "rewards/accuracy_reward/std": 0.20733514428138733, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.94970703125, + "rewards/tag_count_reward/std": 0.15651197731494904, "step": 1971 }, { @@ -57174,27 +57174,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 866.716796875, - "completions/mean_terminated_length": 800.9546508789062, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 870.744140625, + "completions/mean_terminated_length": 847.2928466796875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.673209866006657, - "grad_norm": 0.9211564660072327, - "kl": 6.71875, - "learning_rate": 3.626200257326697e-07, - "loss": 0.4365, - "num_tokens": 1077963629.0, - "reward": 1.8642578125, - "reward_std": 0.5401686429977417, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.20429649949073792, + "grad_norm": 2.6116867065429688, + "kl": 2.4765625, + "learning_rate": 3.6279718587368955e-07, + "loss": 0.14, + "num_tokens": 1112117756.0, + "reward": 1.0986328125, + "reward_std": 0.3741343319416046, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.18428993225097656, "step": 1972 }, { @@ -57203,27 +57203,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1961.0, - "completions/mean_length": 827.431640625, - "completions/mean_terminated_length": 788.0584716796875, - "completions/min_length": 42.0, - "completions/min_terminated_length": 42.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 831.884765625, + "completions/mean_terminated_length": 822.30908203125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.6735512503200478, - "grad_norm": 0.977889895439148, - "kl": 6.328125, - "learning_rate": 3.621327393574458e-07, - "loss": 0.3933, - "num_tokens": 1078466554.0, - "reward": 1.9169921875, - "reward_std": 0.5646554231643677, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.9345703125, - "rewards/tag_count_reward/std": 0.18000927567481995, + "grad_norm": 5.75847864151001, + "kl": 2.12109375, + "learning_rate": 3.6230961791934934e-07, + "loss": 0.143, + "num_tokens": 1112622961.0, + "reward": 1.13232421875, + "reward_std": 0.35911405086517334, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.94677734375, + "rewards/tag_count_reward/std": 0.16246140003204346, "step": 1973 }, { @@ -57232,27 +57232,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1996.0, - "completions/mean_length": 856.5234375, - "completions/mean_terminated_length": 797.9262084960938, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 866.443359375, + "completions/mean_terminated_length": 847.6885375976562, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.6738926346334386, - "grad_norm": 1.3242242336273193, - "kl": 6.3203125, - "learning_rate": 3.616457196254367e-07, - "loss": 0.4215, - "num_tokens": 1078991414.0, - "reward": 1.82958984375, - "reward_std": 0.485893577337265, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.19532686471939087, + "grad_norm": 3.6523139476776123, + "kl": 2.201171875, + "learning_rate": 3.6182231655933437e-07, + "loss": 0.1174, + "num_tokens": 1113152900.0, + "reward": 1.02001953125, + "reward_std": 0.3131875991821289, + "rewards/accuracy_reward/mean": 0.033203125, + "rewards/accuracy_reward/std": 0.17934183776378632, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.1854754239320755, "step": 1974 }, { @@ -57261,27 +57261,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1934.0, - "completions/mean_length": 838.375, - "completions/mean_terminated_length": 801.8671875, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 888.130859375, + "completions/mean_terminated_length": 865.0259399414062, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.6742340189468294, - "grad_norm": 1.0018812417984009, - "kl": 5.53515625, - "learning_rate": 3.6115896722787833e-07, - "loss": 0.3668, - "num_tokens": 1079503526.0, - "reward": 1.89794921875, - "reward_std": 0.4896755814552307, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.94091796875, - "rewards/tag_count_reward/std": 0.1721867471933365, + "grad_norm": 3.562194347381592, + "kl": 2.365234375, + "learning_rate": 3.613352824858044e-07, + "loss": 0.108, + "num_tokens": 1113690487.0, + "reward": 1.08544921875, + "reward_std": 0.3480846881866455, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.07421875, + "rewards/format_reward/std": 0.2623828947544098, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.1884550154209137, "step": 1975 }, { @@ -57290,27 +57290,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1988.0, - "completions/mean_length": 793.333984375, - "completions/mean_terminated_length": 752.86083984375, - "completions/min_length": 115.0, - "completions/min_terminated_length": 115.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 881.294921875, + "completions/mean_terminated_length": 862.7758178710938, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, "epoch": 0.6745754032602201, - "grad_norm": 1.165183424949646, - "kl": 6.2578125, - "learning_rate": 3.606724828556265e-07, - "loss": 0.3995, - "num_tokens": 1079994033.0, - "reward": 1.82861328125, - "reward_std": 0.4689714014530182, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.18737702071666718, + "grad_norm": 2.8378944396972656, + "kl": 2.150390625, + "learning_rate": 3.6084851639054e-07, + "loss": 0.1132, + "num_tokens": 1114226030.0, + "reward": 1.0546875, + "reward_std": 0.3625064492225647, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.19170314073562622, "step": 1976 }, { @@ -57319,27 +57319,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 782.591796875, - "completions/mean_terminated_length": 747.01806640625, - "completions/min_length": 91.0, - "completions/min_terminated_length": 91.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 840.76171875, + "completions/mean_terminated_length": 824.0277709960938, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, "epoch": 0.674916787573611, - "grad_norm": 1.5107274055480957, - "kl": 6.8046875, - "learning_rate": 3.6018626719915646e-07, - "loss": 0.431, - "num_tokens": 1080471184.0, - "reward": 1.88037109375, - "reward_std": 0.5095371007919312, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.17917592823505402, + "grad_norm": 3.299548864364624, + "kl": 2.84765625, + "learning_rate": 3.603620189649408e-07, + "loss": 0.1827, + "num_tokens": 1114732964.0, + "reward": 1.07958984375, + "reward_std": 0.37545472383499146, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.18990464508533478, "step": 1977 }, { @@ -57348,27 +57348,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 766.013671875, - "completions/mean_terminated_length": 729.973876953125, - "completions/min_length": 153.0, - "completions/min_terminated_length": 153.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 820.591796875, + "completions/mean_terminated_length": 796.1414794921875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.6752581718870018, - "grad_norm": 2.550865650177002, - "kl": 8.0859375, - "learning_rate": 3.5970032094856305e-07, - "loss": 0.4875, - "num_tokens": 1080937911.0, - "reward": 1.89404296875, - "reward_std": 0.5230495929718018, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.19280590116977692, + "grad_norm": 5.121732234954834, + "kl": 3.56640625, + "learning_rate": 3.5987579090002496e-07, + "loss": 0.2022, + "num_tokens": 1115227635.0, + "reward": 1.0732421875, + "reward_std": 0.38599497079849243, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.8994140625, + "rewards/tag_count_reward/std": 0.21305173635482788, "step": 1978 }, { @@ -57377,27 +57377,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1997.0, - "completions/mean_length": 820.462890625, - "completions/mean_terminated_length": 783.4144897460938, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2044.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 813.009765625, + "completions/mean_terminated_length": 813.009765625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.6755995562003926, - "grad_norm": 1.9703261852264404, - "kl": 7.1484375, - "learning_rate": 3.5921464479355744e-07, - "loss": 0.4242, - "num_tokens": 1081427444.0, - "reward": 1.8330078125, - "reward_std": 0.5129296779632568, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.9306640625, - "rewards/tag_count_reward/std": 0.19175048172473907, + "grad_norm": 2.082010507583618, + "kl": 2.244140625, + "learning_rate": 3.593898328864279e-07, + "loss": 0.1115, + "num_tokens": 1115713352.0, + "reward": 1.05224609375, + "reward_std": 0.34903088212013245, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.92724609375, + "rewards/tag_count_reward/std": 0.18625685572624207, "step": 1979 }, { @@ -57406,27 +57406,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1816.0, - "completions/mean_length": 772.873046875, - "completions/mean_terminated_length": 721.03857421875, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 833.857421875, + "completions/mean_terminated_length": 829.0961303710938, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.6759409405137834, - "grad_norm": 1.5295605659484863, - "kl": 7.34375, - "learning_rate": 3.5872923942346875e-07, - "loss": 0.4682, - "num_tokens": 1081897843.0, - "reward": 1.880859375, - "reward_std": 0.4452829360961914, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.9453125, - "rewards/tag_count_reward/std": 0.17328274250030518, + "grad_norm": 6.166898727416992, + "kl": 3.1796875, + "learning_rate": 3.589041456144017e-07, + "loss": 0.1486, + "num_tokens": 1116214975.0, + "reward": 1.02734375, + "reward_std": 0.3698437213897705, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.20910651981830597, "step": 1980 }, { @@ -57435,27 +57435,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 805.861328125, - "completions/mean_terminated_length": 770.9417114257812, - "completions/min_length": 54.0, - "completions/min_terminated_length": 54.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 897.015625, + "completions/mean_terminated_length": 871.7445068359375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.6762823248271742, - "grad_norm": 1.6627914905548096, - "kl": 4.974609375, - "learning_rate": 3.582441055272406e-07, - "loss": 0.3214, - "num_tokens": 1082400764.0, - "reward": 1.93310546875, - "reward_std": 0.5067430734634399, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.17917592823505402, + "grad_norm": 4.866971015930176, + "kl": 3.171875, + "learning_rate": 3.584187297738136e-07, + "loss": 0.1425, + "num_tokens": 1116764567.0, + "reward": 1.208984375, + "reward_std": 0.5000156164169312, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.39980348944664, + "rewards/format_reward/mean": 0.107421875, + "rewards/format_reward/std": 0.30995169281959534, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.21154166758060455, "step": 1981 }, { @@ -57464,27 +57464,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 818.62109375, - "completions/mean_terminated_length": 778.9636840820312, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 821.767578125, + "completions/mean_terminated_length": 799.8270263671875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.676623709140565, - "grad_norm": 1.4015086889266968, - "kl": 5.55078125, - "learning_rate": 3.577592437934321e-07, - "loss": 0.3658, - "num_tokens": 1082892522.0, - "reward": 1.99462890625, - "reward_std": 0.4820824861526489, - "rewards/accuracy_reward/mean": 0.134765625, - "rewards/accuracy_reward/std": 0.3418070077896118, - "rewards/format_reward/mean": 0.908203125, - "rewards/format_reward/std": 0.289021372795105, - "rewards/tag_count_reward/mean": 0.95166015625, - "rewards/tag_count_reward/std": 0.16694028675556183, + "grad_norm": 3.286105155944824, + "kl": 3.7109375, + "learning_rate": 3.579335860541456e-07, + "loss": 0.246, + "num_tokens": 1117257936.0, + "reward": 1.1552734375, + "reward_std": 0.4390408992767334, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3776407241821289, + "rewards/format_reward/mean": 0.072265625, + "rewards/format_reward/std": 0.2591804563999176, + "rewards/tag_count_reward/mean": 0.9111328125, + "rewards/tag_count_reward/std": 0.2049129605293274, "step": 1982 }, { @@ -57493,27 +57493,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1840.0, - "completions/mean_length": 803.07421875, - "completions/mean_terminated_length": 744.5194091796875, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 839.322265625, + "completions/mean_terminated_length": 817.69580078125, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, "epoch": 0.6769650934539558, - "grad_norm": 2.1695973873138428, - "kl": 6.640625, - "learning_rate": 3.572746549102159e-07, - "loss": 0.4691, - "num_tokens": 1083388384.0, - "reward": 1.86376953125, - "reward_std": 0.4834197163581848, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.18620555102825165, + "grad_norm": 5.260016918182373, + "kl": 3.83203125, + "learning_rate": 3.574487151444927e-07, + "loss": 0.205, + "num_tokens": 1117772357.0, + "reward": 1.0322265625, + "reward_std": 0.3532659113407135, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.9111328125, + "rewards/tag_count_reward/std": 0.20008091628551483, "step": 1983 }, { @@ -57522,27 +57522,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1989.0, - "completions/mean_length": 731.607421875, - "completions/mean_terminated_length": 721.2421264648438, - "completions/min_length": 173.0, - "completions/min_terminated_length": 173.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 799.982421875, + "completions/mean_terminated_length": 785.183837890625, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.6773064777673465, - "grad_norm": 1.1995561122894287, - "kl": 2.927734375, - "learning_rate": 3.567903395653767e-07, - "loss": 0.1891, - "num_tokens": 1083837687.0, - "reward": 1.982421875, - "reward_std": 0.43703126907348633, - "rewards/accuracy_reward/mean": 0.11088709533214569, - "rewards/accuracy_reward/std": 0.3143092691898346, - "rewards/format_reward/mean": 0.91796875, - "rewards/format_reward/std": 0.2746807038784027, - "rewards/tag_count_reward/mean": 0.95703125, - "rewards/tag_count_reward/std": 0.1462491750717163, + "grad_norm": 3.0933825969696045, + "kl": 3.26171875, + "learning_rate": 3.5696411773356303e-07, + "loss": 0.1943, + "num_tokens": 1118256668.0, + "reward": 1.10546875, + "reward_std": 0.4066968858242035, + "rewards/accuracy_reward/mean": 0.1391129046678543, + "rewards/accuracy_reward/std": 0.34641367197036743, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.19974872469902039, "step": 1984 }, { @@ -57551,27 +57551,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 787.6796875, - "completions/mean_terminated_length": 772.7352294921875, - "completions/min_length": 211.0, - "completions/min_terminated_length": 211.0, + "completions/max_terminated_length": 1915.0, + "completions/mean_length": 842.900390625, + "completions/mean_terminated_length": 831.0158081054688, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, "epoch": 0.6776478620807374, - "grad_norm": 1.6124293804168701, - "kl": 3.671875, - "learning_rate": 3.563062984463123e-07, - "loss": 0.2372, - "num_tokens": 1084316835.0, - "reward": 1.9638671875, - "reward_std": 0.4849792420864105, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, - "rewards/format_reward/mean": 0.900390625, - "rewards/format_reward/std": 0.29977133870124817, - "rewards/tag_count_reward/mean": 0.9501953125, - "rewards/tag_count_reward/std": 0.1593773365020752, + "grad_norm": 2.8358771800994873, + "kl": 2.51171875, + "learning_rate": 3.5647979450967557e-07, + "loss": 0.1547, + "num_tokens": 1118764089.0, + "reward": 1.10888671875, + "reward_std": 0.3980877697467804, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.92529296875, + "rewards/tag_count_reward/std": 0.18548057973384857, "step": 1985 }, { @@ -57580,27 +57580,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1969.0, - "completions/mean_length": 768.302734375, - "completions/mean_terminated_length": 729.6800537109375, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 838.1328125, + "completions/mean_terminated_length": 811.56884765625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, "epoch": 0.6779892463941282, - "grad_norm": 1.0881859064102173, - "kl": 4.890625, - "learning_rate": 3.5582253224003e-07, - "loss": 0.3377, - "num_tokens": 1084791342.0, - "reward": 1.8779296875, - "reward_std": 0.4652717411518097, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.1779806911945343, + "grad_norm": 6.475921630859375, + "kl": 2.88671875, + "learning_rate": 3.559957461607608e-07, + "loss": 0.1223, + "num_tokens": 1119274349.0, + "reward": 1.12060546875, + "reward_std": 0.40999656915664673, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.08984375, + "rewards/format_reward/std": 0.2862374484539032, + "rewards/tag_count_reward/mean": 0.92724609375, + "rewards/tag_count_reward/std": 0.18160201609134674, "step": 1986 }, { @@ -57609,27 +57609,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 799.8203125, - "completions/mean_terminated_length": 754.340087890625, - "completions/min_length": 56.0, - "completions/min_terminated_length": 56.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 846.189453125, + "completions/mean_terminated_length": 839.1061401367188, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, "epoch": 0.678330630707519, - "grad_norm": 1.7563881874084473, - "kl": 6.609375, - "learning_rate": 3.553390416331478e-07, - "loss": 0.4096, - "num_tokens": 1085282130.0, - "reward": 1.8115234375, - "reward_std": 0.5413820147514343, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.19145125150680542, + "grad_norm": 2.7994697093963623, + "kl": 2.6640625, + "learning_rate": 3.555119733743576e-07, + "loss": 0.1466, + "num_tokens": 1119788878.0, + "reward": 1.04736328125, + "reward_std": 0.3752076327800751, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.072265625, + "rewards/format_reward/std": 0.2591804563999176, + "rewards/tag_count_reward/mean": 0.91259765625, + "rewards/tag_count_reward/std": 0.1942230463027954, "step": 1987 }, { @@ -57638,27 +57638,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 832.158203125, - "completions/mean_terminated_length": 769.7433471679688, - "completions/min_length": 138.0, - "completions/min_terminated_length": 138.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 868.541015625, + "completions/mean_terminated_length": 854.5553588867188, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, "epoch": 0.6786720150209098, - "grad_norm": 2.784532070159912, - "kl": 8.6328125, - "learning_rate": 3.5485582731189176e-07, - "loss": 0.4979, - "num_tokens": 1085784931.0, - "reward": 1.7998046875, - "reward_std": 0.5651649236679077, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.9072265625, - "rewards/tag_count_reward/std": 0.216009259223938, + "grad_norm": 2.3474643230438232, + "kl": 2.169921875, + "learning_rate": 3.5502847683761426e-07, + "loss": 0.0831, + "num_tokens": 1120310307.0, + "reward": 1.0771484375, + "reward_std": 0.35952842235565186, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.08203125, + "rewards/format_reward/std": 0.2746807038784027, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.18374989926815033, "step": 1988 }, { @@ -57669,25 +57669,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 808.9921875, - "completions/mean_terminated_length": 774.1605834960938, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 1932.0, + "completions/mean_length": 877.611328125, + "completions/mean_terminated_length": 844.7088012695312, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.6790133993343006, - "grad_norm": 1.4216352701187134, - "kl": 6.53125, - "learning_rate": 3.5437288996209704e-07, - "loss": 0.3949, - "num_tokens": 1086277039.0, - "reward": 1.86474609375, - "reward_std": 0.5679696202278137, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.19068296253681183, + "grad_norm": 3.081923246383667, + "kl": 2.171875, + "learning_rate": 3.5454525723728644e-07, + "loss": 0.1376, + "num_tokens": 1120837548.0, + "reward": 1.1044921875, + "reward_std": 0.3634968400001526, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.9111328125, + "rewards/tag_count_reward/std": 0.2019064873456955, "step": 1989 }, { @@ -57696,27 +57696,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 789.94921875, - "completions/mean_terminated_length": 738.8088989257812, - "completions/min_length": 57.0, - "completions/min_terminated_length": 57.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 840.35546875, + "completions/mean_terminated_length": 823.6158447265625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, "epoch": 0.6793547836476914, - "grad_norm": 3.0501508712768555, - "kl": 8.9375, - "learning_rate": 3.5389023026920384e-07, - "loss": 0.5632, - "num_tokens": 1086764821.0, - "reward": 1.80712890625, - "reward_std": 0.5999290943145752, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.90283203125, - "rewards/tag_count_reward/std": 0.22274482250213623, + "grad_norm": 1.98365318775177, + "kl": 2.28125, + "learning_rate": 3.5406231525973653e-07, + "loss": 0.1379, + "num_tokens": 1121351138.0, + "reward": 1.11328125, + "reward_std": 0.3751985430717468, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.17952290177345276, "step": 1990 }, { @@ -57725,27 +57725,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 793.982421875, - "completions/mean_terminated_length": 743.0060424804688, - "completions/min_length": 32.0, - "completions/min_terminated_length": 32.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 844.826171875, + "completions/mean_terminated_length": 825.7282104492188, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.6796961679610822, - "grad_norm": 3.246222972869873, - "kl": 8.07421875, - "learning_rate": 3.534078489182598e-07, - "loss": 0.4707, - "num_tokens": 1087252092.0, - "reward": 1.74365234375, - "reward_std": 0.5731247663497925, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.90380859375, - "rewards/tag_count_reward/std": 0.21364909410476685, + "grad_norm": 3.4340503215789795, + "kl": 2.275390625, + "learning_rate": 3.535796515909319e-07, + "loss": 0.1451, + "num_tokens": 1121864441.0, + "reward": 1.0244140625, + "reward_std": 0.30911821126937866, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.1875789761543274, "step": 1991 }, { @@ -57754,27 +57754,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 746.060546875, - "completions/mean_terminated_length": 695.8843383789062, - "completions/min_length": 78.0, - "completions/min_terminated_length": 78.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 832.080078125, + "completions/mean_terminated_length": 815.2257690429688, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.6800375522744729, - "grad_norm": 2.1648812294006348, - "kl": 7.1484375, - "learning_rate": 3.5292574659391716e-07, - "loss": 0.4566, - "num_tokens": 1087704699.0, - "reward": 1.8671875, - "reward_std": 0.5629680752754211, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.1937359869480133, + "grad_norm": 4.344378471374512, + "kl": 2.103515625, + "learning_rate": 3.530972669164463e-07, + "loss": 0.0752, + "num_tokens": 1122361090.0, + "reward": 1.1416015625, + "reward_std": 0.3675711750984192, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.9423828125, + "rewards/tag_count_reward/std": 0.16507895290851593, "step": 1992 }, { @@ -57783,27 +57783,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.072265625, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 822.86328125, - "completions/mean_terminated_length": 727.4315185546875, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_terminated_length": 1951.0, + "completions/mean_length": 832.84765625, + "completions/mean_terminated_length": 806.1676635742188, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.6803789365878637, - "grad_norm": 3.0087924003601074, - "kl": 9.5234375, - "learning_rate": 3.524439239804318e-07, - "loss": 0.6276, - "num_tokens": 1088205829.0, - "reward": 1.78857421875, - "reward_std": 0.6144096851348877, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.80078125, - "rewards/format_reward/std": 0.39980348944664, - "rewards/tag_count_reward/mean": 0.88623046875, - "rewards/tag_count_reward/std": 0.2400074452161789, + "grad_norm": 1.649202823638916, + "kl": 2.43359375, + "learning_rate": 3.5261516192145523e-07, + "loss": 0.1047, + "num_tokens": 1122867332.0, + "reward": 1.10205078125, + "reward_std": 0.42149943113327026, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.08203125, + "rewards/format_reward/std": 0.2746807038784027, + "rewards/tag_count_reward/mean": 0.90869140625, + "rewards/tag_count_reward/std": 0.20710861682891846, "step": 1993 }, { @@ -57812,27 +57812,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1979.0, - "completions/mean_length": 776.033203125, - "completions/mean_terminated_length": 718.9244384765625, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 834.421875, + "completions/mean_terminated_length": 802.8056030273438, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.6807203209012546, - "grad_norm": 1.7864247560501099, - "kl": 6.5, - "learning_rate": 3.519623817616629e-07, - "loss": 0.4486, - "num_tokens": 1088685478.0, - "reward": 1.83984375, - "reward_std": 0.5918079614639282, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.2059757262468338, + "grad_norm": 4.389552593231201, + "kl": 2.513671875, + "learning_rate": 3.5213333729073823e-07, + "loss": 0.171, + "num_tokens": 1123376876.0, + "reward": 1.10205078125, + "reward_std": 0.35682186484336853, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.92041015625, + "rewards/tag_count_reward/std": 0.1931772381067276, "step": 1994 }, { @@ -57841,27 +57841,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1996.0, - "completions/mean_length": 761.509765625, - "completions/mean_terminated_length": 717.3273315429688, - "completions/min_length": 90.0, - "completions/min_terminated_length": 90.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 845.3359375, + "completions/mean_terminated_length": 823.8170776367188, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, "epoch": 0.6810617052146454, - "grad_norm": 1.4981460571289062, - "kl": 5.1953125, - "learning_rate": 3.514811206210716e-07, - "loss": 0.3131, - "num_tokens": 1089149531.0, - "reward": 1.828125, - "reward_std": 0.4942927360534668, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.1919422447681427, + "grad_norm": 3.0156242847442627, + "kl": 2.021484375, + "learning_rate": 3.5165179370867593e-07, + "loss": 0.1032, + "num_tokens": 1123883848.0, + "reward": 1.0595703125, + "reward_std": 0.3213019073009491, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.1731034219264984, "step": 1995 }, { @@ -57870,27 +57870,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 801.904296875, - "completions/mean_terminated_length": 771.998046875, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 861.154296875, + "completions/mean_terminated_length": 847.0810546875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.6814030895280362, - "grad_norm": 2.0844836235046387, - "kl": 5.75, - "learning_rate": 3.5100014124171995e-07, - "loss": 0.3838, - "num_tokens": 1089641050.0, - "reward": 1.833984375, - "reward_std": 0.5666632652282715, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.19297493994235992, + "grad_norm": 1.314761996269226, + "kl": 2.4921875, + "learning_rate": 3.511705318592504e-07, + "loss": 0.1224, + "num_tokens": 1124405703.0, + "reward": 1.0869140625, + "reward_std": 0.39862626791000366, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.1844143271446228, "step": 1996 }, { @@ -57899,27 +57899,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 791.677734375, - "completions/mean_terminated_length": 727.184814453125, - "completions/min_length": 167.0, - "completions/min_terminated_length": 167.0, + "completions/max_terminated_length": 1971.0, + "completions/mean_length": 804.669921875, + "completions/mean_terminated_length": 784.9345703125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, "epoch": 0.681744473841427, - "grad_norm": 2.7286179065704346, - "kl": 6.75, - "learning_rate": 3.505194443062701e-07, - "loss": 0.4735, - "num_tokens": 1090122053.0, - "reward": 1.830078125, - "reward_std": 0.5621203780174255, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.90625, - "rewards/tag_count_reward/std": 0.22229015827178955, + "grad_norm": 2.5341320037841797, + "kl": 2.087890625, + "learning_rate": 3.506895524260426e-07, + "loss": 0.0712, + "num_tokens": 1124893358.0, + "reward": 1.1083984375, + "reward_std": 0.33458152413368225, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.9423828125, + "rewards/tag_count_reward/std": 0.15904124081134796, "step": 1997 }, { @@ -57928,27 +57928,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 729.859375, - "completions/mean_terminated_length": 695.51904296875, - "completions/min_length": 84.0, - "completions/min_terminated_length": 84.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 800.244140625, + "completions/mean_terminated_length": 777.91845703125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.6820858581548178, - "grad_norm": 3.886467695236206, - "kl": 4.31640625, - "learning_rate": 3.5003903049698356e-07, - "loss": 0.3389, - "num_tokens": 1090564541.0, - "reward": 1.96240234375, - "reward_std": 0.465962290763855, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.95263671875, - "rewards/tag_count_reward/std": 0.15188921988010406, + "grad_norm": 2.728328227996826, + "kl": 2.35546875, + "learning_rate": 3.502088560922335e-07, + "loss": 0.1507, + "num_tokens": 1125371883.0, + "reward": 1.130859375, + "reward_std": 0.33455926179885864, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.177639439702034, "step": 1998 }, { @@ -57957,27 +57957,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 778.80078125, - "completions/mean_terminated_length": 713.6468505859375, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 831.91796875, + "completions/mean_terminated_length": 817.498046875, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, "epoch": 0.6824272424682086, - "grad_norm": 1.008063554763794, - "kl": 5.9375, - "learning_rate": 3.4955890049572e-07, - "loss": 0.3869, - "num_tokens": 1091043527.0, - "reward": 1.89892578125, - "reward_std": 0.5463054776191711, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.1966189742088318, + "grad_norm": 1.903712511062622, + "kl": 2.166015625, + "learning_rate": 3.4972844354060095e-07, + "loss": 0.0918, + "num_tokens": 1125878065.0, + "reward": 1.1298828125, + "reward_std": 0.38038915395736694, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.16586430370807648, "step": 1999 }, { @@ -57986,27 +57986,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1924.0, - "completions/mean_length": 779.732421875, - "completions/mean_terminated_length": 749.2940063476562, - "completions/min_length": 3.0, - "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 868.986328125, + "completions/mean_terminated_length": 847.890625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, "epoch": 0.6827686267815993, - "grad_norm": 1.1527817249298096, - "kl": 6.109375, - "learning_rate": 3.490790549839359e-07, - "loss": 0.388, - "num_tokens": 1091526270.0, - "reward": 1.80908203125, - "reward_std": 0.5509384274482727, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.2177731692790985, + "grad_norm": 2.7636938095092773, + "kl": 1.86328125, + "learning_rate": 3.492483154535205e-07, + "loss": 0.1129, + "num_tokens": 1126406506.0, + "reward": 1.0703125, + "reward_std": 0.34476834535598755, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.1697956770658493, "step": 2000 }, { @@ -58015,27 +58015,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1989.0, - "completions/mean_length": 773.5, - "completions/mean_terminated_length": 748.111572265625, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 811.107421875, + "completions/mean_terminated_length": 803.8173217773438, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.6831100110949901, - "grad_norm": 1.2709134817123413, - "kl": 5.1953125, - "learning_rate": 3.4859949464268456e-07, - "loss": 0.3416, - "num_tokens": 1092006478.0, - "reward": 1.85986328125, - "reward_std": 0.5381577610969543, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.18997004628181458, + "grad_norm": 3.018387794494629, + "kl": 2.203125, + "learning_rate": 3.4876847251296287e-07, + "loss": 0.0932, + "num_tokens": 1126905969.0, + "reward": 1.0888671875, + "reward_std": 0.35073888301849365, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.16366049647331238, "step": 2001 }, { @@ -58044,27 +58044,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1870.0, - "completions/mean_length": 719.9140625, - "completions/mean_terminated_length": 690.7545166015625, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 816.5, + "completions/mean_terminated_length": 801.8972778320312, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.683451395408381, - "grad_norm": 1.2730306386947632, - "kl": 5.625, - "learning_rate": 3.481202201526136e-07, - "loss": 0.3509, - "num_tokens": 1092455394.0, - "reward": 1.849609375, - "reward_std": 0.5185894966125488, + "grad_norm": 3.1583056449890137, + "kl": 2.74609375, + "learning_rate": 3.4828891540049463e-07, + "loss": 0.1691, + "num_tokens": 1127404337.0, + "reward": 1.044921875, + "reward_std": 0.31024712324142456, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.18600594997406006, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.17759640514850616, "step": 2002 }, { @@ -58073,27 +58073,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1986.0, - "completions/mean_length": 779.9140625, - "completions/mean_terminated_length": 728.3658447265625, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 817.423828125, + "completions/mean_terminated_length": 802.83203125, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, "epoch": 0.6837927797217718, - "grad_norm": 1.204888939857483, - "kl": 7.76171875, - "learning_rate": 3.4764123219396613e-07, - "loss": 0.5224, - "num_tokens": 1092934246.0, - "reward": 1.86328125, - "reward_std": 0.552196741104126, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.1835651993751526, + "grad_norm": 2.3933656215667725, + "kl": 2.765625, + "learning_rate": 3.478096447972756e-07, + "loss": 0.1365, + "num_tokens": 1127902394.0, + "reward": 1.1455078125, + "reward_std": 0.3759608268737793, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.18583892285823822, "step": 2003 }, { @@ -58102,27 +58102,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 769.638671875, - "completions/mean_terminated_length": 736.3346557617188, - "completions/min_length": 21.0, - "completions/min_terminated_length": 21.0, + "completions/max_terminated_length": 1939.0, + "completions/mean_length": 804.73828125, + "completions/mean_terminated_length": 787.5050048828125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, "epoch": 0.6841341640351626, - "grad_norm": 1.5956668853759766, - "kl": 7.1171875, - "learning_rate": 3.471625314465773e-07, - "loss": 0.454, - "num_tokens": 1093408269.0, - "reward": 1.8251953125, - "reward_std": 0.5145806074142456, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.18887855112552643, + "grad_norm": 3.059457540512085, + "kl": 2.826171875, + "learning_rate": 3.473306613840589e-07, + "loss": 0.1479, + "num_tokens": 1128394388.0, + "reward": 1.03759765625, + "reward_std": 0.31256619095802307, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.1818806678056717, "step": 2004 }, { @@ -58131,27 +58131,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 769.146484375, - "completions/mean_terminated_length": 719.8600463867188, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 837.025390625, + "completions/mean_terminated_length": 815.3577880859375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.6844755483485534, - "grad_norm": 3.291630744934082, - "kl": 7.890625, - "learning_rate": 3.466841185898756e-07, - "loss": 0.4675, - "num_tokens": 1093893368.0, - "reward": 1.7783203125, - "reward_std": 0.5961418747901917, - "rewards/accuracy_reward/mean": 0.06854838877916336, - "rewards/accuracy_reward/std": 0.25293973088264465, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.9033203125, - "rewards/tag_count_reward/std": 0.21485605835914612, + "grad_norm": 2.3869965076446533, + "kl": 2.634765625, + "learning_rate": 3.4685196584119e-07, + "loss": 0.1271, + "num_tokens": 1128914241.0, + "reward": 1.0986328125, + "reward_std": 0.3809621334075928, + "rewards/accuracy_reward/mean": 0.10282257944345474, + "rewards/accuracy_reward/std": 0.30403366684913635, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.9326171875, + "rewards/tag_count_reward/std": 0.1859828382730484, "step": 2005 }, { @@ -58160,27 +58160,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 739.6171875, - "completions/mean_terminated_length": 694.682861328125, - "completions/min_length": 78.0, - "completions/min_terminated_length": 78.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 815.326171875, + "completions/mean_terminated_length": 788.261474609375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.6848169326619442, - "grad_norm": 2.7448348999023438, - "kl": 7.609375, - "learning_rate": 3.4620599430288077e-07, - "loss": 0.507, - "num_tokens": 1094349124.0, - "reward": 1.830078125, - "reward_std": 0.5540704727172852, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.2108539640903473, + "grad_norm": 3.5276033878326416, + "kl": 3.0625, + "learning_rate": 3.463735588486053e-07, + "loss": 0.1983, + "num_tokens": 1129408760.0, + "reward": 1.060546875, + "reward_std": 0.3475075960159302, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.18546061217784882, "step": 2006 }, { @@ -58189,27 +58189,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1865.0, - "completions/mean_length": 739.24609375, - "completions/mean_terminated_length": 688.8073120117188, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 849.634765625, + "completions/mean_terminated_length": 842.5717163085938, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.685158316975335, - "grad_norm": 3.8149802684783936, - "kl": 7.921875, - "learning_rate": 3.457281592642026e-07, - "loss": 0.4571, - "num_tokens": 1094811010.0, - "reward": 1.81787109375, - "reward_std": 0.542671799659729, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.20506852865219116, + "grad_norm": 3.876281976699829, + "kl": 3.0703125, + "learning_rate": 3.45895441085831e-07, + "loss": 0.1372, + "num_tokens": 1129927165.0, + "reward": 1.0859375, + "reward_std": 0.4188704490661621, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.1937359869480133, "step": 2007 }, { @@ -58218,27 +58218,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1850.0, - "completions/mean_length": 721.427734375, - "completions/mean_terminated_length": 667.5020141601562, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 814.70703125, + "completions/mean_terminated_length": 812.2935180664062, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.6854997012887258, - "grad_norm": 1.4170217514038086, - "kl": 6.5859375, - "learning_rate": 3.452506141520406e-07, - "loss": 0.4268, - "num_tokens": 1095256397.0, - "reward": 1.90576171875, - "reward_std": 0.5055143237113953, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.19025151431560516, + "grad_norm": 2.854119062423706, + "kl": 2.578125, + "learning_rate": 3.4541761323198295e-07, + "loss": 0.1152, + "num_tokens": 1130420311.0, + "reward": 1.13330078125, + "reward_std": 0.35503774881362915, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.16971616446971893, "step": 2008 }, { @@ -58247,27 +58247,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 699.96484375, - "completions/mean_terminated_length": 673.111572265625, - "completions/min_length": 102.0, - "completions/min_terminated_length": 102.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 791.271484375, + "completions/mean_terminated_length": 781.3759765625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.6858410856021165, - "grad_norm": 1.7245264053344727, - "kl": 5.9765625, - "learning_rate": 3.4477335964418237e-07, - "loss": 0.3983, - "num_tokens": 1095699547.0, - "reward": 1.82568359375, - "reward_std": 0.5196582078933716, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.1900404542684555, + "grad_norm": 3.0499801635742188, + "kl": 2.15234375, + "learning_rate": 3.449400759657653e-07, + "loss": 0.1278, + "num_tokens": 1130910210.0, + "reward": 1.0634765625, + "reward_std": 0.3152570128440857, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.1709705889225006, "step": 2009 }, { @@ -58276,27 +58276,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 776.666015625, - "completions/mean_terminated_length": 727.6693725585938, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 834.228515625, + "completions/mean_terminated_length": 822.2584228515625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.6861824699155074, - "grad_norm": 1.3633527755737305, - "kl": 6.90625, - "learning_rate": 3.442963964180039e-07, - "loss": 0.4468, - "num_tokens": 1096177952.0, - "reward": 1.81787109375, - "reward_std": 0.5621967315673828, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.21267634630203247, + "grad_norm": 2.3293509483337402, + "kl": 2.64453125, + "learning_rate": 3.4446282996546853e-07, + "loss": 0.1461, + "num_tokens": 1131418087.0, + "reward": 1.06298828125, + "reward_std": 0.3098532259464264, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.93408203125, + "rewards/tag_count_reward/std": 0.17743425071239471, "step": 2010 }, { @@ -58305,27 +58305,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 771.876953125, - "completions/mean_terminated_length": 733.3621215820312, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 884.306640625, + "completions/mean_terminated_length": 872.8303833007812, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.6865238542288982, - "grad_norm": 2.924910545349121, - "kl": 5.1171875, - "learning_rate": 3.4381972515046675e-07, - "loss": 0.3663, - "num_tokens": 1096653425.0, - "reward": 1.86083984375, - "reward_std": 0.5203917026519775, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.18779471516609192, + "grad_norm": 2.228442668914795, + "kl": 1.83203125, + "learning_rate": 3.439858759089709e-07, + "loss": 0.0698, + "num_tokens": 1131951124.0, + "reward": 1.12939453125, + "reward_std": 0.3936542272567749, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.091796875, + "rewards/format_reward/std": 0.289021372795105, + "rewards/tag_count_reward/mean": 0.93994140625, + "rewards/tag_count_reward/std": 0.16605646908283234, "step": 2011 }, { @@ -58334,27 +58334,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1959.0, - "completions/mean_length": 775.73828125, - "completions/mean_terminated_length": 732.0444946289062, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 853.876953125, + "completions/mean_terminated_length": 834.9226684570312, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.686865238542289, - "grad_norm": 3.1730332374572754, - "kl": 5.5703125, - "learning_rate": 3.4334334651811895e-07, - "loss": 0.4103, - "num_tokens": 1097123883.0, - "reward": 1.8857421875, - "reward_std": 0.4755280613899231, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.892578125, - "rewards/format_reward/std": 0.30995169281959534, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.17727059125900269, + "grad_norm": 2.3702590465545654, + "kl": 1.798828125, + "learning_rate": 3.435092144737346e-07, + "loss": 0.0937, + "num_tokens": 1132461589.0, + "reward": 1.068359375, + "reward_std": 0.3254043161869049, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.935546875, + "rewards/tag_count_reward/std": 0.17274148762226105, "step": 2012 }, { @@ -58363,27 +58363,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1900.0, - "completions/mean_length": 767.333984375, - "completions/mean_terminated_length": 717.9776611328125, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 827.44140625, + "completions/mean_terminated_length": 817.8306884765625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.6872066228556798, - "grad_norm": 1.8164947032928467, - "kl": 7.328125, - "learning_rate": 3.428672611970931e-07, - "loss": 0.4735, - "num_tokens": 1097602390.0, - "reward": 1.853515625, - "reward_std": 0.5261906981468201, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.19872237741947174, + "grad_norm": 2.512019634246826, + "kl": 2.55078125, + "learning_rate": 3.4303284633680716e-07, + "loss": 0.1226, + "num_tokens": 1132970871.0, + "reward": 1.05908203125, + "reward_std": 0.3575366735458374, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.19964765012264252, "step": 2013 }, { @@ -58392,27 +58392,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1833.0, - "completions/mean_length": 779.359375, - "completions/mean_terminated_length": 722.3999633789062, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 874.576171875, + "completions/mean_terminated_length": 853.5805053710938, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.6875480071690706, - "grad_norm": 1.9302737712860107, - "kl": 6.7109375, - "learning_rate": 3.423914698631044e-07, - "loss": 0.4349, - "num_tokens": 1098079854.0, - "reward": 1.85888671875, - "reward_std": 0.5373066663742065, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.20597051084041595, + "grad_norm": 1.6990776062011719, + "kl": 2.140625, + "learning_rate": 3.425567721748187e-07, + "loss": 0.1304, + "num_tokens": 1133497086.0, + "reward": 1.0947265625, + "reward_std": 0.3705121576786041, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.9189453125, + "rewards/tag_count_reward/std": 0.19288331270217896, "step": 2014 }, { @@ -58421,27 +58421,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 826.439453125, - "completions/mean_terminated_length": 784.4868774414062, - "completions/min_length": 134.0, - "completions/min_terminated_length": 134.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 870.970703125, + "completions/mean_terminated_length": 849.9105224609375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.6878893914824614, - "grad_norm": 1.0338008403778076, - "kl": 7.3125, - "learning_rate": 3.4191597319145246e-07, - "loss": 0.47, - "num_tokens": 1098572783.0, - "reward": 1.82373046875, - "reward_std": 0.5456644892692566, - "rewards/accuracy_reward/mean": 0.052419353276491165, - "rewards/accuracy_reward/std": 0.22309619188308716, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.20463472604751587, + "grad_norm": 7.498424530029297, + "kl": 2.765625, + "learning_rate": 3.420809926639825e-07, + "loss": 0.2008, + "num_tokens": 1134012815.0, + "reward": 0.99072265625, + "reward_std": 0.3221360445022583, + "rewards/accuracy_reward/mean": 0.04838709533214569, + "rewards/accuracy_reward/std": 0.21479946374893188, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.90673828125, + "rewards/tag_count_reward/std": 0.20917905867099762, "step": 2015 }, { @@ -58450,27 +58450,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1988.0, - "completions/mean_length": 721.32421875, - "completions/mean_terminated_length": 670.1947021484375, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 798.990234375, + "completions/mean_terminated_length": 786.672607421875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, "epoch": 0.6882307757958522, - "grad_norm": 1.4855358600616455, - "kl": 6.5078125, - "learning_rate": 3.414407718570172e-07, - "loss": 0.431, - "num_tokens": 1099023829.0, - "reward": 1.89501953125, - "reward_std": 0.45963796973228455, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.17533229291439056, + "grad_norm": 15.323307037353516, + "kl": 2.36328125, + "learning_rate": 3.416055084800927e-07, + "loss": 0.0926, + "num_tokens": 1134503626.0, + "reward": 1.0966796875, + "reward_std": 0.3540540337562561, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.9404296875, + "rewards/tag_count_reward/std": 0.17308135330677032, "step": 2016 }, { @@ -58479,27 +58479,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 802.76171875, - "completions/mean_terminated_length": 741.5204467773438, - "completions/min_length": 115.0, - "completions/min_terminated_length": 115.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 831.29296875, + "completions/mean_terminated_length": 816.8656616210938, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, "epoch": 0.6885721601092429, - "grad_norm": 2.0285561084747314, - "kl": 8.4453125, - "learning_rate": 3.4096586653426053e-07, - "loss": 0.5113, - "num_tokens": 1099513595.0, - "reward": 1.84619140625, - "reward_std": 0.5822349786758423, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.21884989738464355, + "grad_norm": 5.637637138366699, + "kl": 1.76953125, + "learning_rate": 3.411303202985245e-07, + "loss": 0.0625, + "num_tokens": 1135008000.0, + "reward": 1.1337890625, + "reward_std": 0.378518670797348, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.17297089099884033, "step": 2017 }, { @@ -58508,27 +58508,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 764.267578125, - "completions/mean_terminated_length": 722.8568115234375, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 833.11328125, + "completions/mean_terminated_length": 828.3490600585938, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.6889135444226338, - "grad_norm": 1.34463369846344, - "kl": 7.734375, - "learning_rate": 3.404912578972232e-07, - "loss": 0.4943, - "num_tokens": 1099979332.0, - "reward": 1.83740234375, - "reward_std": 0.5318292379379272, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.19512130320072174, + "grad_norm": 3.005314826965332, + "kl": 1.98046875, + "learning_rate": 3.406554287942324e-07, + "loss": 0.0838, + "num_tokens": 1135508986.0, + "reward": 1.107421875, + "reward_std": 0.36660492420196533, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.18546061217784882, "step": 2018 }, { @@ -58537,27 +58537,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 798.251953125, - "completions/mean_terminated_length": 757.9375, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 851.09765625, + "completions/mean_terminated_length": 844.0432739257812, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, "epoch": 0.6892549287360246, - "grad_norm": 1.9976649284362793, - "kl": 6.140625, - "learning_rate": 3.400169466195256e-07, - "loss": 0.3996, - "num_tokens": 1100474757.0, - "reward": 1.87255859375, - "reward_std": 0.5219699144363403, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.19719645380973816, + "grad_norm": 2.1439764499664307, + "kl": 1.560546875, + "learning_rate": 3.401808346417495e-07, + "loss": 0.0628, + "num_tokens": 1136031468.0, + "reward": 1.076171875, + "reward_std": 0.2905398905277252, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.15384027361869812, "step": 2019 }, { @@ -58566,27 +58566,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 777.501953125, - "completions/mean_terminated_length": 720.4591674804688, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 875.466796875, + "completions/mean_terminated_length": 861.5632934570312, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, "epoch": 0.6895963130494154, - "grad_norm": 1.4548605680465698, - "kl": 7.53125, - "learning_rate": 3.395429333743663e-07, - "loss": 0.4906, - "num_tokens": 1100941478.0, - "reward": 1.81103515625, - "reward_std": 0.5591021776199341, - "rewards/accuracy_reward/mean": 0.05645161122083664, - "rewards/accuracy_reward/std": 0.23102475702762604, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.21399766206741333, + "grad_norm": 3.358839273452759, + "kl": 2.919921875, + "learning_rate": 3.3970653851518657e-07, + "loss": 0.139, + "num_tokens": 1136548347.0, + "reward": 1.04443359375, + "reward_std": 0.36828118562698364, + "rewards/accuracy_reward/mean": 0.07056451588869095, + "rewards/accuracy_reward/std": 0.25635457038879395, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.90966796875, + "rewards/tag_count_reward/std": 0.1953953504562378, "step": 2020 }, { @@ -58595,27 +58595,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1922.0, - "completions/mean_length": 772.6328125, - "completions/mean_terminated_length": 734.1408081054688, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 858.6953125, + "completions/mean_terminated_length": 844.5928955078125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, "epoch": 0.6899376973628062, - "grad_norm": 1.5992845296859741, - "kl": 5.578125, - "learning_rate": 3.3906921883451957e-07, - "loss": 0.3679, - "num_tokens": 1101418618.0, - "reward": 1.93408203125, - "reward_std": 0.5032426118850708, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.93994140625, - "rewards/tag_count_reward/std": 0.18288137018680573, + "grad_norm": 2.7229037284851074, + "kl": 2.23046875, + "learning_rate": 3.3923254108823114e-07, + "loss": 0.1098, + "num_tokens": 1137069551.0, + "reward": 1.12451171875, + "reward_std": 0.38507571816444397, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.17503775656223297, "step": 2021 }, { @@ -58624,27 +58624,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1930.0, - "completions/mean_length": 774.623046875, - "completions/mean_terminated_length": 736.1911010742188, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 823.71875, + "completions/mean_terminated_length": 814.0787353515625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.690279081676197, - "grad_norm": 1.1282154321670532, - "kl": 5.109375, - "learning_rate": 3.3859580367233695e-07, - "loss": 0.3283, - "num_tokens": 1101886553.0, - "reward": 1.87548828125, - "reward_std": 0.41805973649024963, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.17322689294815063, + "grad_norm": 2.605647325515747, + "kl": 2.232421875, + "learning_rate": 3.387588430341461e-07, + "loss": 0.0925, + "num_tokens": 1137562623.0, + "reward": 1.05224609375, + "reward_std": 0.32912611961364746, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.92529296875, + "rewards/tag_count_reward/std": 0.1861388236284256, "step": 2022 }, { @@ -58653,27 +58653,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1984.0, - "completions/mean_length": 780.873046875, - "completions/mean_terminated_length": 739.9979858398438, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 823.99609375, + "completions/mean_terminated_length": 811.925048828125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, "epoch": 0.6906204659895878, - "grad_norm": 1.2273058891296387, - "kl": 5.2109375, - "learning_rate": 3.3812268855974475e-07, - "loss": 0.383, - "num_tokens": 1102353176.0, - "reward": 1.93115234375, - "reward_std": 0.3770022392272949, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.9140625, - "rewards/format_reward/std": 0.28054583072662354, - "rewards/tag_count_reward/mean": 0.95458984375, - "rewards/tag_count_reward/std": 0.1603061705827713, + "grad_norm": 2.058511972427368, + "kl": 2.2421875, + "learning_rate": 3.382854450257696e-07, + "loss": 0.1134, + "num_tokens": 1138051325.0, + "reward": 1.0810546875, + "reward_std": 0.3180699646472931, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.9423828125, + "rewards/tag_count_reward/std": 0.16057194769382477, "step": 2023 }, { @@ -58682,27 +58682,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1969.0, - "completions/mean_length": 712.54296875, - "completions/mean_terminated_length": 683.2215576171875, - "completions/min_length": 74.0, - "completions/min_terminated_length": 74.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 774.353515625, + "completions/mean_terminated_length": 769.35888671875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, "epoch": 0.6909618503029786, - "grad_norm": 1.0310503244400024, - "kl": 4.8515625, - "learning_rate": 3.376498741682433e-07, - "loss": 0.3448, - "num_tokens": 1102796814.0, - "reward": 1.96044921875, - "reward_std": 0.45979851484298706, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.91015625, - "rewards/format_reward/std": 0.2862374484539032, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.18106979131698608, + "grad_norm": 3.8489980697631836, + "kl": 1.998046875, + "learning_rate": 3.378123477355135e-07, + "loss": 0.1424, + "num_tokens": 1138526610.0, + "reward": 1.138671875, + "reward_std": 0.3458394408226013, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.15359161794185638, "step": 2024 }, { @@ -58711,27 +58711,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1843.0, - "completions/mean_length": 710.173828125, - "completions/mean_terminated_length": 694.310302734375, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 860.919921875, + "completions/mean_terminated_length": 842.0774536132812, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, "epoch": 0.6913032346163693, - "grad_norm": 1.1273776292800903, - "kl": 3.265625, - "learning_rate": 3.3717736116890585e-07, - "loss": 0.2073, - "num_tokens": 1103243111.0, - "reward": 1.93359375, - "reward_std": 0.33209705352783203, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.9296875, - "rewards/format_reward/std": 0.25592297315597534, - "rewards/tag_count_reward/mean": 0.96484375, - "rewards/tag_count_reward/std": 0.14070820808410645, + "grad_norm": 5.384428977966309, + "kl": 2.55078125, + "learning_rate": 3.373395518353625e-07, + "loss": 0.0867, + "num_tokens": 1139050089.0, + "reward": 1.0693359375, + "reward_std": 0.357952743768692, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.18308307230472565, "step": 2025 }, { @@ -58740,27 +58740,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 705.255859375, - "completions/mean_terminated_length": 692.0137939453125, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 789.62109375, + "completions/mean_terminated_length": 782.204345703125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, "epoch": 0.6916446189297601, - "grad_norm": 1.2359204292297363, - "kl": 4.28125, - "learning_rate": 3.3670515023237866e-07, - "loss": 0.2912, - "num_tokens": 1103679482.0, - "reward": 1.98388671875, - "reward_std": 0.459837943315506, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.95263671875, - "rewards/tag_count_reward/std": 0.15897136926651, + "grad_norm": 2.204078435897827, + "kl": 2.310546875, + "learning_rate": 3.3686705799687285e-07, + "loss": 0.1235, + "num_tokens": 1139529655.0, + "reward": 1.13232421875, + "reward_std": 0.37530261278152466, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.93310546875, + "rewards/tag_count_reward/std": 0.1749831587076187, "step": 2026 }, { @@ -58769,27 +58769,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1940.0, - "completions/mean_length": 768.39453125, - "completions/mean_terminated_length": 729.7745971679688, - "completions/min_length": 176.0, - "completions/min_terminated_length": 176.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 892.69921875, + "completions/mean_terminated_length": 869.685302734375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.691986003243151, - "grad_norm": 1.2850275039672852, - "kl": 4.8046875, - "learning_rate": 3.362332420288786e-07, - "loss": 0.3307, - "num_tokens": 1104144388.0, - "reward": 1.9501953125, - "reward_std": 0.4600157141685486, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.912109375, - "rewards/format_reward/std": 0.2834126651287079, - "rewards/tag_count_reward/mean": 0.9541015625, - "rewards/tag_count_reward/std": 0.1551237553358078, + "grad_norm": 4.572418212890625, + "kl": 2.7109375, + "learning_rate": 3.3639486689117234e-07, + "loss": 0.0986, + "num_tokens": 1140058205.0, + "reward": 1.08740234375, + "reward_std": 0.39657920598983765, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.92724609375, + "rewards/tag_count_reward/std": 0.18427632749080658, "step": 2027 }, { @@ -58798,27 +58798,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1953.0, - "completions/mean_length": 750.505859375, - "completions/mean_terminated_length": 724.6593627929688, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/max_terminated_length": 1897.0, + "completions/mean_length": 855.8984375, + "completions/mean_terminated_length": 824.8416748046875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, "epoch": 0.6923273875565418, - "grad_norm": 1.6819928884506226, - "kl": 4.51171875, - "learning_rate": 3.3576163722819273e-07, - "loss": 0.3185, - "num_tokens": 1104615527.0, - "reward": 1.90869140625, - "reward_std": 0.3826946020126343, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.91796875, - "rewards/format_reward/std": 0.2746807038784027, - "rewards/tag_count_reward/mean": 0.95166015625, - "rewards/tag_count_reward/std": 0.15944552421569824, + "grad_norm": 7.1265950202941895, + "kl": 2.912109375, + "learning_rate": 3.3592297918895824e-07, + "loss": 0.26, + "num_tokens": 1140583305.0, + "reward": 1.02294921875, + "reward_std": 0.30741724371910095, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.025390625, + "rewards/format_reward/std": 0.15746226906776428, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.19118840992450714, "step": 2028 }, { @@ -58827,27 +58827,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1996.0, - "completions/mean_length": 725.001953125, - "completions/mean_terminated_length": 706.6633911132812, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 871.46875, + "completions/mean_terminated_length": 850.41748046875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, "epoch": 0.6926687718699326, - "grad_norm": 1.4650315046310425, - "kl": 4.1171875, - "learning_rate": 3.3529033649967843e-07, - "loss": 0.2806, - "num_tokens": 1105076824.0, - "reward": 1.94873046875, - "reward_std": 0.3908160626888275, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.916015625, - "rewards/format_reward/std": 0.2776356339454651, - "rewards/tag_count_reward/mean": 0.95263671875, - "rewards/tag_count_reward/std": 0.15742509067058563, + "grad_norm": 4.3199663162231445, + "kl": 2.412109375, + "learning_rate": 3.354513955604971e-07, + "loss": 0.1707, + "num_tokens": 1141119593.0, + "reward": 1.06103515625, + "reward_std": 0.34209397435188293, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.18718312680721283, "step": 2029 }, { @@ -58856,27 +58856,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 714.0, - "completions/mean_terminated_length": 698.1818237304688, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 847.966796875, + "completions/mean_terminated_length": 838.5177001953125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.6930101561833234, - "grad_norm": 1.4474140405654907, - "kl": 4.62109375, - "learning_rate": 3.3481934051226024e-07, - "loss": 0.2668, - "num_tokens": 1105517496.0, - "reward": 1.9580078125, - "reward_std": 0.4199273884296417, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.912109375, - "rewards/format_reward/std": 0.2834126651287079, - "rewards/tag_count_reward/mean": 0.9580078125, - "rewards/tag_count_reward/std": 0.14736543595790863, + "grad_norm": 2.2698123455047607, + "kl": 2.1640625, + "learning_rate": 3.3498011667562365e-07, + "loss": 0.0946, + "num_tokens": 1141628856.0, + "reward": 1.09228515625, + "reward_std": 0.3573184609413147, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.17875948548316956, "step": 2030 }, { @@ -58885,27 +58885,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1916.0, - "completions/mean_length": 749.974609375, - "completions/mean_terminated_length": 729.37109375, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 878.076171875, + "completions/mean_terminated_length": 864.20361328125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.6933515404967142, - "grad_norm": 2.888395309448242, - "kl": 5.88671875, - "learning_rate": 3.3434864993443123e-07, - "loss": 0.3294, - "num_tokens": 1105984731.0, - "reward": 1.89697265625, - "reward_std": 0.4438630938529968, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.900390625, - "rewards/format_reward/std": 0.29977133870124817, - "rewards/tag_count_reward/mean": 0.94775390625, - "rewards/tag_count_reward/std": 0.16795603930950165, + "grad_norm": 2.515773057937622, + "kl": 2.22265625, + "learning_rate": 3.345091432037398e-07, + "loss": 0.0831, + "num_tokens": 1142161679.0, + "reward": 1.044921875, + "reward_std": 0.3244918882846832, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.939453125, + "rewards/tag_count_reward/std": 0.16988569498062134, "step": 2031 }, { @@ -58914,27 +58914,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1986.0, - "completions/mean_length": 708.150390625, - "completions/mean_terminated_length": 684.1768798828125, - "completions/min_length": 96.0, - "completions/min_terminated_length": 96.0, + "completions/max_terminated_length": 1953.0, + "completions/mean_length": 811.298828125, + "completions/mean_terminated_length": 799.1026000976562, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.693692924810105, - "grad_norm": 3.6145339012145996, - "kl": 6.3203125, - "learning_rate": 3.338782654342506e-07, - "loss": 0.3938, - "num_tokens": 1106412920.0, - "reward": 1.9541015625, - "reward_std": 0.42945945262908936, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.91796875, - "rewards/format_reward/std": 0.2746807038784027, - "rewards/tag_count_reward/mean": 0.9521484375, - "rewards/tag_count_reward/std": 0.15997575223445892, + "grad_norm": 6.9392876625061035, + "kl": 3.17578125, + "learning_rate": 3.340384758138133e-07, + "loss": 0.1461, + "num_tokens": 1142642680.0, + "reward": 1.11181640625, + "reward_std": 0.3710258901119232, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.92822265625, + "rewards/tag_count_reward/std": 0.18333014845848083, "step": 2032 }, { @@ -58943,27 +58943,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 780.103515625, - "completions/mean_terminated_length": 744.4598388671875, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 895.619140625, + "completions/mean_terminated_length": 884.2544555664062, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, "epoch": 0.6940343091234957, - "grad_norm": 8.065526962280273, - "kl": 7.828125, - "learning_rate": 3.334081876793427e-07, - "loss": 0.473, - "num_tokens": 1106899821.0, - "reward": 1.8876953125, - "reward_std": 0.48659005761146545, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.9345703125, - "rewards/tag_count_reward/std": 0.18863557279109955, + "grad_norm": 5.473264217376709, + "kl": 1.9140625, + "learning_rate": 3.335681151743775e-07, + "loss": 0.0544, + "num_tokens": 1143188725.0, + "reward": 1.10107421875, + "reward_std": 0.34152883291244507, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.17291219532489777, "step": 2033 }, { @@ -58972,27 +58972,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1980.0, - "completions/mean_length": 732.80859375, - "completions/mean_terminated_length": 701.2440185546875, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 1958.0, + "completions/mean_length": 881.931640625, + "completions/mean_terminated_length": 870.4319458007812, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.6943756934368865, - "grad_norm": 4.281768798828125, - "kl": 8.1953125, - "learning_rate": 3.3293841733689745e-07, - "loss": 0.4801, - "num_tokens": 1107347147.0, - "reward": 1.935546875, - "reward_std": 0.44750791788101196, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.89453125, - "rewards/format_reward/std": 0.3074568510055542, - "rewards/tag_count_reward/mean": 0.9453125, - "rewards/tag_count_reward/std": 0.1660744547843933, + "grad_norm": 3.096057653427124, + "kl": 1.7578125, + "learning_rate": 3.3309806195352976e-07, + "loss": 0.1026, + "num_tokens": 1143712402.0, + "reward": 1.0810546875, + "reward_std": 0.2960221469402313, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.9404296875, + "rewards/tag_count_reward/std": 0.1606195569038391, "step": 2034 }, { @@ -59001,27 +59001,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 717.529296875, - "completions/mean_terminated_length": 691.0259399414062, - "completions/min_length": 14.0, - "completions/min_terminated_length": 14.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 811.228515625, + "completions/mean_terminated_length": 794.0851440429688, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.6947170777502774, - "grad_norm": 5.1386308670043945, - "kl": 8.859375, - "learning_rate": 3.324689550736674e-07, - "loss": 0.5111, - "num_tokens": 1107793066.0, - "reward": 1.9365234375, - "reward_std": 0.5220915079116821, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.1806451380252838, + "grad_norm": 2.296532392501831, + "kl": 1.7421875, + "learning_rate": 3.32628316818931e-07, + "loss": 0.0828, + "num_tokens": 1144206295.0, + "reward": 1.1875, + "reward_std": 0.3502257764339447, + "rewards/accuracy_reward/mean": 0.18359375, + "rewards/accuracy_reward/std": 0.3875311613082886, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.951171875, + "rewards/tag_count_reward/std": 0.1493907868862152, "step": 2035 }, { @@ -59030,27 +59030,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 790.515625, - "completions/mean_terminated_length": 755.1646118164062, - "completions/min_length": 209.0, - "completions/min_terminated_length": 209.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 931.650390625, + "completions/mean_terminated_length": 909.412353515625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.6950584620636682, - "grad_norm": 6.330515384674072, - "kl": 9.7578125, - "learning_rate": 3.3199980155596895e-07, - "loss": 0.5453, - "num_tokens": 1108272530.0, - "reward": 1.76953125, - "reward_std": 0.6054547429084778, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.90625, - "rewards/tag_count_reward/std": 0.22338789701461792, + "grad_norm": 3.3257064819335938, + "kl": 2.67578125, + "learning_rate": 3.3215888043780453e-07, + "loss": 0.1469, + "num_tokens": 1144758020.0, + "reward": 1.01416015625, + "reward_std": 0.33109569549560547, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.19656065106391907, "step": 2036 }, { @@ -59059,27 +59059,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1954.0, - "completions/mean_length": 789.607421875, - "completions/mean_terminated_length": 743.7550659179688, - "completions/min_length": 180.0, - "completions/min_terminated_length": 180.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 906.294921875, + "completions/mean_terminated_length": 899.5658569335938, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, "epoch": 0.695399846377059, - "grad_norm": 4.596363067626953, - "kl": 8.5546875, - "learning_rate": 3.315309574496792e-07, - "loss": 0.4735, - "num_tokens": 1108760777.0, - "reward": 1.80078125, - "reward_std": 0.5478556752204895, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.2041865587234497, + "grad_norm": 4.429315567016602, + "kl": 2.109375, + "learning_rate": 3.3168975347693517e-07, + "loss": 0.148, + "num_tokens": 1145306011.0, + "reward": 1.0380859375, + "reward_std": 0.319455087184906, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.18822988867759705, "step": 2037 }, { @@ -59088,27 +59088,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 757.49609375, - "completions/mean_terminated_length": 726.5240478515625, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 873.107421875, + "completions/mean_terminated_length": 856.8218383789062, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, "epoch": 0.6957412306904498, - "grad_norm": 3.411527633666992, - "kl": 7.25, - "learning_rate": 3.310624234202369e-07, - "loss": 0.408, - "num_tokens": 1109224919.0, - "reward": 1.82080078125, - "reward_std": 0.5130316019058228, - "rewards/accuracy_reward/mean": 0.0463709682226181, - "rewards/accuracy_reward/std": 0.21049949526786804, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.18794220685958862, + "grad_norm": 2.7152650356292725, + "kl": 2.84765625, + "learning_rate": 3.3122093660266794e-07, + "loss": 0.1604, + "num_tokens": 1145829346.0, + "reward": 1.02783203125, + "reward_std": 0.32627755403518677, + "rewards/accuracy_reward/mean": 0.07056451588869095, + "rewards/accuracy_reward/std": 0.25635460019111633, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.92041015625, + "rewards/tag_count_reward/std": 0.18998514115810394, "step": 2038 }, { @@ -59117,27 +59117,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 754.6484375, - "completions/mean_terminated_length": 723.6080322265625, - "completions/min_length": 123.0, - "completions/min_terminated_length": 123.0, + "completions/max_terminated_length": 1930.0, + "completions/mean_length": 799.921875, + "completions/mean_terminated_length": 782.621826171875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.6960826150038406, - "grad_norm": 3.268136739730835, - "kl": 8.4296875, - "learning_rate": 3.305942001326404e-07, - "loss": 0.4984, - "num_tokens": 1109692083.0, - "reward": 1.8564453125, - "reward_std": 0.6011396646499634, - "rewards/accuracy_reward/mean": 0.10282257944345474, - "rewards/accuracy_reward/std": 0.30403366684913635, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.19744645059108734, + "grad_norm": 2.535181999206543, + "kl": 1.994140625, + "learning_rate": 3.3075243048090766e-07, + "loss": 0.1136, + "num_tokens": 1146319690.0, + "reward": 1.15576171875, + "reward_std": 0.34523525834083557, + "rewards/accuracy_reward/mean": 0.16532258689403534, + "rewards/accuracy_reward/std": 0.37184643745422363, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.94287109375, + "rewards/tag_count_reward/std": 0.16188396513462067, "step": 2039 }, { @@ -59146,27 +59146,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1963.0, - "completions/mean_length": 771.66796875, - "completions/mean_terminated_length": 741.0360107421875, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, - "epoch": 0.6964239993172314, - "grad_norm": 1.0883808135986328, - "kl": 6.703125, - "learning_rate": 3.3012628825144685e-07, - "loss": 0.4257, - "num_tokens": 1110164665.0, - "reward": 1.8818359375, - "reward_std": 0.5368679165840149, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.9306640625, - "rewards/tag_count_reward/std": 0.19238728284835815, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 851.9921875, + "completions/mean_terminated_length": 837.810302734375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.6964239993172314, + "grad_norm": 2.6821112632751465, + "kl": 2.62109375, + "learning_rate": 3.3028423577711755e-07, + "loss": 0.1409, + "num_tokens": 1146833398.0, + "reward": 1.09521484375, + "reward_std": 0.35190898180007935, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.92919921875, + "rewards/tag_count_reward/std": 0.18237383663654327, "step": 2040 }, { @@ -59175,27 +59175,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 771.451171875, - "completions/mean_terminated_length": 740.8140258789062, - "completions/min_length": 123.0, - "completions/min_terminated_length": 123.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 899.22265625, + "completions/mean_terminated_length": 885.600830078125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, "epoch": 0.6967653836306221, - "grad_norm": 1.7109276056289673, - "kl": 5.6015625, - "learning_rate": 3.296586884407717e-07, - "loss": 0.3596, - "num_tokens": 1110641136.0, - "reward": 1.8779296875, - "reward_std": 0.4632631838321686, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.17864517867565155, + "grad_norm": 3.7952985763549805, + "kl": 2.0546875, + "learning_rate": 3.2981635315631885e-07, + "loss": 0.0854, + "num_tokens": 1147375288.0, + "reward": 1.08984375, + "reward_std": 0.31668949127197266, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.15118376910686493, "step": 2041 }, { @@ -59204,27 +59204,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1914.0, - "completions/mean_length": 747.58203125, - "completions/mean_terminated_length": 711.0240478515625, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 842.53515625, + "completions/mean_terminated_length": 835.4302978515625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, "epoch": 0.6971067679440129, - "grad_norm": 2.1707868576049805, - "kl": 4.99609375, - "learning_rate": 3.2919140136428727e-07, - "loss": 0.2871, - "num_tokens": 1111105162.0, - "reward": 1.8544921875, - "reward_std": 0.5191654562950134, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.192208394408226, + "grad_norm": 1.8702855110168457, + "kl": 1.96875, + "learning_rate": 3.293487832830891e-07, + "loss": 0.0994, + "num_tokens": 1147887930.0, + "reward": 1.1171875, + "reward_std": 0.306997686624527, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.947265625, + "rewards/tag_count_reward/std": 0.15920037031173706, "step": 2042 }, { @@ -59233,27 +59233,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1916.0, - "completions/mean_length": 781.337890625, - "completions/mean_terminated_length": 756.1055908203125, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 863.55078125, + "completions/mean_terminated_length": 847.1326904296875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, "epoch": 0.6974481522574038, - "grad_norm": 1.295634388923645, - "kl": 4.94140625, - "learning_rate": 3.287244276852223e-07, - "loss": 0.3002, - "num_tokens": 1111578871.0, - "reward": 1.88818359375, - "reward_std": 0.5161925554275513, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.93505859375, - "rewards/tag_count_reward/std": 0.1798466295003891, + "grad_norm": 3.088501453399658, + "kl": 2.53515625, + "learning_rate": 3.288815268215622e-07, + "loss": 0.1548, + "num_tokens": 1148403732.0, + "reward": 1.07421875, + "reward_std": 0.30261605978012085, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.18501698970794678, "step": 2043 }, { @@ -59262,27 +59262,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1916.0, - "completions/mean_length": 815.978515625, - "completions/mean_terminated_length": 778.7947387695312, - "completions/min_length": 183.0, - "completions/min_terminated_length": 183.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 899.251953125, + "completions/mean_terminated_length": 890.2066650390625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.6977895365707946, - "grad_norm": 3.391709804534912, - "kl": 4.97265625, - "learning_rate": 3.282577680663604e-07, - "loss": 0.3362, - "num_tokens": 1112071068.0, - "reward": 1.82861328125, - "reward_std": 0.5299324989318848, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.1867896467447281, + "grad_norm": 3.1716232299804688, + "kl": 1.78125, + "learning_rate": 3.2841458443542604e-07, + "loss": 0.0997, + "num_tokens": 1148938565.0, + "reward": 1.0849609375, + "reward_std": 0.34739550948143005, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.1666000783443451, "step": 2044 }, { @@ -59291,27 +59291,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1923.0, - "completions/mean_length": 809.005859375, - "completions/mean_terminated_length": 763.8603515625, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 882.08984375, + "completions/mean_terminated_length": 870.5917358398438, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, "epoch": 0.6981309208841854, - "grad_norm": 4.1205902099609375, - "kl": 4.71875, - "learning_rate": 3.277914231700393e-07, - "loss": 0.3509, - "num_tokens": 1112566815.0, - "reward": 1.84375, - "reward_std": 0.5765846967697144, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.1967410445213318, + "grad_norm": 4.9459614753723145, + "kl": 2.212890625, + "learning_rate": 3.279479567879232e-07, + "loss": 0.1019, + "num_tokens": 1149471731.0, + "reward": 1.10107421875, + "reward_std": 0.33557015657424927, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.17007611691951752, "step": 2045 }, { @@ -59320,27 +59320,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1949.0, - "completions/mean_length": 825.34375, - "completions/mean_terminated_length": 746.544677734375, - "completions/min_length": 117.0, - "completions/min_terminated_length": 117.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 933.83203125, + "completions/mean_terminated_length": 909.3692626953125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.6984723051975762, - "grad_norm": 1.545742392539978, - "kl": 6.078125, - "learning_rate": 3.273253936581506e-07, - "loss": 0.3816, - "num_tokens": 1113069775.0, - "reward": 1.7470703125, - "reward_std": 0.5435930490493774, - "rewards/accuracy_reward/mean": 0.04435483738780022, - "rewards/accuracy_reward/std": 0.2060900777578354, - "rewards/format_reward/mean": 0.8046875, - "rewards/format_reward/std": 0.3968288004398346, - "rewards/tag_count_reward/mean": 0.8994140625, - "rewards/tag_count_reward/std": 0.22909599542617798, + "grad_norm": 1.4658163785934448, + "kl": 2.37109375, + "learning_rate": 3.2748164454184867e-07, + "loss": 0.1249, + "num_tokens": 1150030237.0, + "reward": 1.04833984375, + "reward_std": 0.33467191457748413, + "rewards/accuracy_reward/mean": 0.060483869165182114, + "rewards/accuracy_reward/std": 0.2386218160390854, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.1804301142692566, "step": 2046 }, { @@ -59349,27 +59349,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 800.427734375, - "completions/mean_terminated_length": 747.0692749023438, - "completions/min_length": 82.0, - "completions/min_terminated_length": 82.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 899.19140625, + "completions/mean_terminated_length": 873.9680786132812, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, "epoch": 0.698813689510967, - "grad_norm": 2.848963737487793, - "kl": 5.6953125, - "learning_rate": 3.2685968019213784e-07, - "loss": 0.4033, - "num_tokens": 1113552778.0, - "reward": 1.802734375, - "reward_std": 0.5537554025650024, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.19890500605106354, + "grad_norm": 1.4968277215957642, + "kl": 2.8984375, + "learning_rate": 3.270156483595496e-07, + "loss": 0.1856, + "num_tokens": 1150563807.0, + "reward": 1.02490234375, + "reward_std": 0.33558905124664307, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.19357748329639435, "step": 2047 }, { @@ -59378,27 +59378,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1913.0, - "completions/mean_length": 812.1015625, - "completions/mean_terminated_length": 759.2423706054688, - "completions/min_length": 28.0, - "completions/min_terminated_length": 28.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 866.634765625, + "completions/mean_terminated_length": 847.8829956054688, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.6991550738243578, - "grad_norm": 1.1179615259170532, - "kl": 7.1015625, - "learning_rate": 3.2639428343299623e-07, - "loss": 0.4567, - "num_tokens": 1114042814.0, - "reward": 1.76806640625, - "reward_std": 0.5800941586494446, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.90869140625, - "rewards/tag_count_reward/std": 0.21235711872577667, + "grad_norm": 2.8308959007263184, + "kl": 1.94921875, + "learning_rate": 3.2654996890292453e-07, + "loss": 0.0786, + "num_tokens": 1151081764.0, + "reward": 1.05419921875, + "reward_std": 0.24821466207504272, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.94287109375, + "rewards/tag_count_reward/std": 0.1663554310798645, "step": 2048 }, { @@ -59407,27 +59407,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1941.0, - "completions/mean_length": 802.75390625, - "completions/mean_terminated_length": 752.1340942382812, - "completions/min_length": 96.0, - "completions/min_terminated_length": 96.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 871.810546875, + "completions/mean_terminated_length": 855.5069580078125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.6994964581377485, - "grad_norm": 0.9869757890701294, - "kl": 6.78125, - "learning_rate": 3.259292040412711e-07, - "loss": 0.4029, - "num_tokens": 1114527936.0, - "reward": 1.72021484375, - "reward_std": 0.5575881600379944, - "rewards/accuracy_reward/mean": 0.025390625, - "rewards/accuracy_reward/std": 0.15746226906776428, - "rewards/format_reward/mean": 0.79296875, - "rewards/format_reward/std": 0.40557438135147095, - "rewards/tag_count_reward/mean": 0.90185546875, - "rewards/tag_count_reward/std": 0.21390387415885925, + "grad_norm": 2.3430943489074707, + "kl": 2.51171875, + "learning_rate": 3.260846068334218e-07, + "loss": 0.134, + "num_tokens": 1151602243.0, + "reward": 1.0439453125, + "reward_std": 0.288002073764801, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.1709705889225006, "step": 2049 }, { @@ -59436,27 +59436,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1942.0, - "completions/mean_length": 769.087890625, - "completions/mean_terminated_length": 743.611572265625, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 864.6484375, + "completions/mean_terminated_length": 848.2455444335938, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, "epoch": 0.6998378424511393, - "grad_norm": 1.2725328207015991, - "kl": 5.4140625, - "learning_rate": 3.2546444267705786e-07, - "loss": 0.348, - "num_tokens": 1114997725.0, - "reward": 1.86083984375, - "reward_std": 0.5448645353317261, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.18110673129558563, + "grad_norm": 3.197082757949829, + "kl": 2.87890625, + "learning_rate": 3.256195628120387e-07, + "loss": 0.1317, + "num_tokens": 1152120959.0, + "reward": 1.107421875, + "reward_std": 0.35819485783576965, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.19534705579280853, "step": 2050 }, { @@ -59465,27 +59465,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 876.27734375, - "completions/mean_terminated_length": 813.5925903320312, - "completions/min_length": 27.0, - "completions/min_terminated_length": 27.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 936.4140625, + "completions/mean_terminated_length": 914.2709350585938, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, "epoch": 0.7001792267645301, - "grad_norm": 0.8814513087272644, - "kl": 7.859375, - "learning_rate": 3.250000000000001e-07, - "loss": 0.4884, - "num_tokens": 1115519995.0, - "reward": 1.79833984375, - "reward_std": 0.6143364906311035, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.21277517080307007, + "grad_norm": 2.7993712425231934, + "kl": 1.865234375, + "learning_rate": 3.2515483749932136e-07, + "loss": 0.1201, + "num_tokens": 1152674019.0, + "reward": 1.07861328125, + "reward_std": 0.3104711174964905, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.1793731451034546, "step": 2051 }, { @@ -59494,27 +59494,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 873.76953125, - "completions/mean_terminated_length": 803.26708984375, - "completions/min_length": 39.0, - "completions/min_terminated_length": 39.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 930.775390625, + "completions/mean_terminated_length": 917.5277099609375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.700520611077921, - "grad_norm": 2.498657464981079, - "kl": 8.8671875, - "learning_rate": 3.245358766692891e-07, - "loss": 0.5218, - "num_tokens": 1116043653.0, - "reward": 1.7578125, - "reward_std": 0.5840620398521423, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.90625, - "rewards/tag_count_reward/std": 0.21615298092365265, + "grad_norm": 2.217512845993042, + "kl": 2.126953125, + "learning_rate": 3.2469043155536266e-07, + "loss": 0.1157, + "num_tokens": 1153226864.0, + "reward": 1.04296875, + "reward_std": 0.3125886023044586, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.1816289722919464, "step": 2052 }, { @@ -59523,27 +59523,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1912.0, - "completions/mean_length": 889.03125, - "completions/mean_terminated_length": 824.5113525390625, - "completions/min_length": 185.0, - "completions/min_terminated_length": 185.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 997.1484375, + "completions/mean_terminated_length": 961.0586547851562, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.7008619953913118, - "grad_norm": 3.3309242725372314, - "kl": 7.984375, - "learning_rate": 3.2407207334366347e-07, - "loss": 0.4576, - "num_tokens": 1116575157.0, - "reward": 1.77685546875, - "reward_std": 0.6070467829704285, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.90576171875, - "rewards/tag_count_reward/std": 0.2173515260219574, + "grad_norm": 1.6279301643371582, + "kl": 3.1015625, + "learning_rate": 3.242263456398022e-07, + "loss": 0.1879, + "num_tokens": 1153813724.0, + "reward": 1.0234375, + "reward_std": 0.36192587018013, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.896484375, + "rewards/tag_count_reward/std": 0.21106232702732086, "step": 2053 }, { @@ -59552,27 +59552,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 861.42578125, - "completions/mean_terminated_length": 797.9464721679688, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 900.798828125, + "completions/mean_terminated_length": 882.58935546875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.7012033797047026, - "grad_norm": 1.4844022989273071, - "kl": 7.8515625, - "learning_rate": 3.2360859068140666e-07, - "loss": 0.4569, - "num_tokens": 1117093103.0, - "reward": 1.75830078125, - "reward_std": 0.582671046257019, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.21437686681747437, + "grad_norm": 3.0876924991607666, + "kl": 2.564453125, + "learning_rate": 3.237625804118249e-07, + "loss": 0.1628, + "num_tokens": 1154351829.0, + "reward": 1.03369140625, + "reward_std": 0.3081708550453186, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15143637359142303, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.1820801943540573, "step": 2054 }, { @@ -59581,27 +59581,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1938.0, - "completions/mean_length": 811.24609375, - "completions/mean_terminated_length": 773.9194946289062, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 913.6484375, + "completions/mean_terminated_length": 904.716552734375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, "epoch": 0.7015447640180934, - "grad_norm": 0.7517781853675842, - "kl": 5.75, - "learning_rate": 3.2314542934034813e-07, - "loss": 0.3329, - "num_tokens": 1117587165.0, - "reward": 1.85107421875, - "reward_std": 0.5656074285507202, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.19561529159545898, + "grad_norm": 2.9013869762420654, + "kl": 1.642578125, + "learning_rate": 3.232991365301604e-07, + "loss": 0.0972, + "num_tokens": 1154898321.0, + "reward": 1.08935546875, + "reward_std": 0.2962498664855957, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.95458984375, + "rewards/tag_count_reward/std": 0.14508728682994843, "step": 2055 }, { @@ -59610,27 +59610,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1975.0, - "completions/mean_length": 798.5546875, - "completions/mean_terminated_length": 763.4296875, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 869.607421875, + "completions/mean_terminated_length": 867.3013916015625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.7018861483314842, - "grad_norm": 1.4017919301986694, - "kl": 5.7265625, - "learning_rate": 3.2268258997786015e-07, - "loss": 0.3736, - "num_tokens": 1118067785.0, - "reward": 1.904296875, - "reward_std": 0.5260640382766724, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.18633443117141724, + "grad_norm": 1.661656141281128, + "kl": 1.658203125, + "learning_rate": 3.2283601465308135e-07, + "loss": 0.0571, + "num_tokens": 1155415320.0, + "reward": 1.125, + "reward_std": 0.3145996332168579, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.17416280508041382, + "rewards/tag_count_reward/mean": 0.943359375, + "rewards/tag_count_reward/std": 0.16467617452144623, "step": 2056 }, { @@ -59639,27 +59639,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 862.255859375, - "completions/mean_terminated_length": 803.9405517578125, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 936.619140625, + "completions/mean_terminated_length": 927.8681030273438, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, "epoch": 0.7022275326448749, - "grad_norm": 2.1168081760406494, - "kl": 5.796875, - "learning_rate": 3.2222007325085885e-07, - "loss": 0.3847, - "num_tokens": 1118585004.0, - "reward": 1.80859375, - "reward_std": 0.5263746976852417, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.2045232206583023, + "grad_norm": 4.517663955688477, + "kl": 1.8046875, + "learning_rate": 3.2237321543840367e-07, + "loss": 0.0484, + "num_tokens": 1155970613.0, + "reward": 1.05224609375, + "reward_std": 0.30492764711380005, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.94482421875, + "rewards/tag_count_reward/std": 0.1572064310312271, "step": 2057 }, { @@ -59668,27 +59668,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1988.0, - "completions/mean_length": 765.3828125, - "completions/mean_terminated_length": 726.6719970703125, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 844.6328125, + "completions/mean_terminated_length": 820.661376953125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, "epoch": 0.7025689169582657, - "grad_norm": 0.7896247506141663, - "kl": 5.66796875, - "learning_rate": 3.217578798158022e-07, - "loss": 0.3404, - "num_tokens": 1119057200.0, - "reward": 1.88037109375, - "reward_std": 0.5610973834991455, - "rewards/accuracy_reward/mean": 0.1088709682226181, - "rewards/accuracy_reward/std": 0.3117917478084564, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.18635430932044983, + "grad_norm": 1.5827946662902832, + "kl": 2.2021484375, + "learning_rate": 3.219107395434843e-07, + "loss": 0.142, + "num_tokens": 1156483385.0, + "reward": 1.15625, + "reward_std": 0.36130523681640625, + "rewards/accuracy_reward/mean": 0.17540322244167328, + "rewards/accuracy_reward/std": 0.3806955814361572, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.17115212976932526, "step": 2058 }, { @@ -59697,27 +59697,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1924.0, - "completions/mean_length": 772.09375, - "completions/mean_terminated_length": 738.8536987304688, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 872.416015625, + "completions/mean_terminated_length": 865.4872436523438, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.7029103012716565, - "grad_norm": 1.0140058994293213, - "kl": 5.375, - "learning_rate": 3.2129601032868884e-07, - "loss": 0.326, - "num_tokens": 1119525584.0, - "reward": 1.93310546875, - "reward_std": 0.5191199779510498, - "rewards/accuracy_reward/mean": 0.134765625, - "rewards/accuracy_reward/std": 0.3418070077896118, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.93505859375, - "rewards/tag_count_reward/std": 0.17916527390480042, + "grad_norm": 2.5658154487609863, + "kl": 3.001953125, + "learning_rate": 3.2144858762522156e-07, + "loss": 0.1707, + "num_tokens": 1157003134.0, + "reward": 1.130859375, + "reward_std": 0.35006314516067505, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.1847999542951584, "step": 2059 }, { @@ -59726,27 +59726,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1936.0, - "completions/mean_length": 865.33984375, - "completions/mean_terminated_length": 817.2642211914062, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 939.365234375, + "completions/mean_terminated_length": 935.0177001953125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.7032516855850474, - "grad_norm": 1.2857820987701416, - "kl": 5.6796875, - "learning_rate": 3.2083446544505847e-07, - "loss": 0.3531, - "num_tokens": 1120048190.0, - "reward": 1.888671875, - "reward_std": 0.5044642686843872, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.19510231912136078, + "grad_norm": 5.632010459899902, + "kl": 2.83984375, + "learning_rate": 3.2098676034005325e-07, + "loss": 0.1229, + "num_tokens": 1157563641.0, + "reward": 1.111328125, + "reward_std": 0.3705633878707886, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.18991030752658844, "step": 2060 }, { @@ -59755,27 +59755,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1912.0, - "completions/mean_length": 844.173828125, - "completions/mean_terminated_length": 805.3406982421875, - "completions/min_length": 87.0, - "completions/min_terminated_length": 87.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 908.017578125, + "completions/mean_terminated_length": 889.9226684570312, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.7035930698984382, - "grad_norm": 1.0574839115142822, - "kl": 5.9765625, - "learning_rate": 3.203732458199893e-07, - "loss": 0.3572, - "num_tokens": 1120570471.0, - "reward": 1.828125, - "reward_std": 0.5073466300964355, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.1669810563325882, + "grad_norm": 7.497401237487793, + "kl": 2.42578125, + "learning_rate": 3.205252583439564e-07, + "loss": 0.0608, + "num_tokens": 1158118610.0, + "reward": 1.0849609375, + "reward_std": 0.3396851718425751, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.9423828125, + "rewards/tag_count_reward/std": 0.15904124081134796, "step": 2061 }, { @@ -59784,27 +59784,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1999.0, - "completions/mean_length": 777.623046875, - "completions/mean_terminated_length": 736.6431274414062, - "completions/min_length": 15.0, - "completions/min_terminated_length": 15.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 865.884765625, + "completions/mean_terminated_length": 847.12109375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.703934454211829, - "grad_norm": 1.050209641456604, - "kl": 5.734375, - "learning_rate": 3.199123521080985e-07, - "loss": 0.3658, - "num_tokens": 1121042230.0, - "reward": 1.90234375, - "reward_std": 0.4538320302963257, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.9375, - "rewards/tag_count_reward/std": 0.18037252128124237, + "grad_norm": 3.1223888397216797, + "kl": 2.669921875, + "learning_rate": 3.200640822924453e-07, + "loss": 0.1456, + "num_tokens": 1158635559.0, + "reward": 1.09228515625, + "reward_std": 0.31413334608078003, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.93603515625, + "rewards/tag_count_reward/std": 0.17327652871608734, "step": 2062 }, { @@ -59813,27 +59813,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1984.0, - "completions/mean_length": 840.525390625, - "completions/mean_terminated_length": 816.4721069335938, - "completions/min_length": 183.0, - "completions/min_terminated_length": 183.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 923.763671875, + "completions/mean_terminated_length": 917.1375732421875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, "epoch": 0.7042758385252198, - "grad_norm": 1.8714277744293213, - "kl": 6.3515625, - "learning_rate": 3.194517849635404e-07, - "loss": 0.3562, - "num_tokens": 1121552643.0, - "reward": 1.85205078125, - "reward_std": 0.5359213948249817, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.17249169945716858, + "grad_norm": 1.7292968034744263, + "kl": 1.890625, + "learning_rate": 3.1960323284057226e-07, + "loss": 0.0822, + "num_tokens": 1159188590.0, + "reward": 1.07275390625, + "reward_std": 0.330628901720047, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.93408203125, + "rewards/tag_count_reward/std": 0.17465519905090332, "step": 2063 }, { @@ -59842,27 +59842,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1841.0, - "completions/mean_length": 877.93359375, - "completions/mean_terminated_length": 825.3999633789062, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 931.328125, + "completions/mean_terminated_length": 918.0869750976562, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, "epoch": 0.7046172228386106, - "grad_norm": 2.246628761291504, - "kl": 6.2265625, - "learning_rate": 3.1899154504000544e-07, - "loss": 0.3539, - "num_tokens": 1122087329.0, - "reward": 1.81689453125, - "reward_std": 0.5580817461013794, - "rewards/accuracy_reward/mean": 0.05645161122083664, - "rewards/accuracy_reward/std": 0.23102475702762604, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.19771909713745117, + "grad_norm": 4.040060043334961, + "kl": 2.953125, + "learning_rate": 3.1914271064292476e-07, + "loss": 0.1694, + "num_tokens": 1159750614.0, + "reward": 1.0517578125, + "reward_std": 0.347946435213089, + "rewards/accuracy_reward/mean": 0.09879032522439957, + "rewards/accuracy_reward/std": 0.2986815273761749, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.9169921875, + "rewards/tag_count_reward/std": 0.1926850527524948, "step": 2064 }, { @@ -59871,27 +59871,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 822.955078125, - "completions/mean_terminated_length": 788.5160522460938, - "completions/min_length": 33.0, - "completions/min_terminated_length": 33.0, + "completions/max_terminated_length": 1888.0, + "completions/mean_length": 913.443359375, + "completions/mean_terminated_length": 897.7168579101562, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.7049586071520013, - "grad_norm": 1.6371833086013794, - "kl": 6.3984375, - "learning_rate": 3.185316329907204e-07, - "loss": 0.4093, - "num_tokens": 1122584394.0, - "reward": 1.89794921875, - "reward_std": 0.5305365324020386, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.18909280002117157, + "grad_norm": 2.0488128662109375, + "kl": 2.6953125, + "learning_rate": 3.1868251635362584e-07, + "loss": 0.1273, + "num_tokens": 1160294009.0, + "reward": 1.16259765625, + "reward_std": 0.3914956748485565, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.93603515625, + "rewards/tag_count_reward/std": 0.1739809662103653, "step": 2065 }, { @@ -59900,27 +59900,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1958.0, - "completions/mean_length": 839.705078125, - "completions/mean_terminated_length": 795.6781616210938, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 935.466796875, + "completions/mean_terminated_length": 913.3048095703125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, "epoch": 0.7052999914653921, - "grad_norm": 1.2888261079788208, - "kl": 5.4921875, - "learning_rate": 3.1807204946844613e-07, - "loss": 0.355, - "num_tokens": 1123093411.0, - "reward": 1.888671875, - "reward_std": 0.4445631802082062, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.1719430834054947, + "grad_norm": 1.882514238357544, + "kl": 2.326171875, + "learning_rate": 3.1822265062633304e-07, + "loss": 0.1318, + "num_tokens": 1160852056.0, + "reward": 1.0576171875, + "reward_std": 0.3285204768180847, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.9326171875, + "rewards/tag_count_reward/std": 0.1833334118127823, "step": 2066 }, { @@ -59929,27 +59929,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 827.259765625, - "completions/mean_terminated_length": 795.4569091796875, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 951.78125, + "completions/mean_terminated_length": 934.3809814453125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, "epoch": 0.7056413757787829, - "grad_norm": 1.8677408695220947, - "kl": 3.96875, - "learning_rate": 3.176127951254775e-07, - "loss": 0.2618, - "num_tokens": 1123593800.0, - "reward": 1.890625, - "reward_std": 0.4381256699562073, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.90234375, - "rewards/format_reward/std": 0.29713961482048035, - "rewards/tag_count_reward/mean": 0.947265625, - "rewards/tag_count_reward/std": 0.1659708470106125, + "grad_norm": 3.051801919937134, + "kl": 2.5703125, + "learning_rate": 3.1776311411423687e-07, + "loss": 0.1584, + "num_tokens": 1161416200.0, + "reward": 1.03857421875, + "reward_std": 0.32101166248321533, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.17502138018608093, "step": 2067 }, { @@ -59958,27 +59958,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 878.208984375, - "completions/mean_terminated_length": 825.687744140625, - "completions/min_length": 104.0, - "completions/min_terminated_length": 104.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 956.146484375, + "completions/mean_terminated_length": 941.0119018554688, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, "epoch": 0.7059827600921738, - "grad_norm": 1.1508092880249023, - "kl": 5.953125, - "learning_rate": 3.1715387061364187e-07, - "loss": 0.3571, - "num_tokens": 1124124851.0, - "reward": 1.83447265625, - "reward_std": 0.5666013956069946, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.20063117146492004, + "grad_norm": 5.0430474281311035, + "kl": 2.25390625, + "learning_rate": 3.173039074700602e-07, + "loss": 0.0733, + "num_tokens": 1161987155.0, + "reward": 1.076171875, + "reward_std": 0.3384384512901306, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.919921875, + "rewards/tag_count_reward/std": 0.19010141491889954, "step": 2068 }, { @@ -59987,27 +59987,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 855.765625, - "completions/mean_terminated_length": 804.7739868164062, - "completions/min_length": 206.0, - "completions/min_terminated_length": 206.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 945.33203125, + "completions/mean_terminated_length": 927.8294067382812, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, "epoch": 0.7063241444055646, - "grad_norm": 0.7897908687591553, - "kl": 5.796875, - "learning_rate": 3.1669527658429914e-07, - "loss": 0.3758, - "num_tokens": 1124644555.0, - "reward": 1.83154296875, - "reward_std": 0.5065468549728394, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.19015607237815857, + "grad_norm": 3.8846516609191895, + "kl": 2.375, + "learning_rate": 3.168450313460577e-07, + "loss": 0.0917, + "num_tokens": 1162552717.0, + "reward": 1.05224609375, + "reward_std": 0.3390119671821594, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.17906923592090607, "step": 2069 }, { @@ -60016,27 +60016,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1909.0, - "completions/mean_length": 840.5, - "completions/mean_terminated_length": 823.7623901367188, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 906.74609375, + "completions/mean_terminated_length": 900.0196533203125, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, "epoch": 0.7066655287189554, - "grad_norm": 1.5946892499923706, - "kl": 3.60546875, - "learning_rate": 3.162370136883389e-07, - "loss": 0.2418, - "num_tokens": 1125148203.0, - "reward": 1.9111328125, - "reward_std": 0.43999338150024414, - "rewards/accuracy_reward/mean": 0.060483869165182114, - "rewards/accuracy_reward/std": 0.2386218160390854, - "rewards/format_reward/mean": 0.90234375, - "rewards/format_reward/std": 0.29713961482048035, - "rewards/tag_count_reward/mean": 0.9501953125, - "rewards/tag_count_reward/std": 0.1593773365020752, + "grad_norm": 8.674768447875977, + "kl": 2.0703125, + "learning_rate": 3.16386486394014e-07, + "loss": 0.0289, + "num_tokens": 1163090283.0, + "reward": 1.08642578125, + "reward_std": 0.3626440763473511, + "rewards/accuracy_reward/mean": 0.11088709533214569, + "rewards/accuracy_reward/std": 0.3143092691898346, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.1780041754245758, "step": 2070 }, { @@ -60045,27 +60045,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1939.0, - "completions/mean_length": 905.765625, - "completions/mean_terminated_length": 856.9124755859375, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 1930.0, + "completions/mean_length": 970.03515625, + "completions/mean_terminated_length": 959.4043579101562, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, "epoch": 0.7070069130323462, - "grad_norm": 1.0739637613296509, - "kl": 5.09765625, - "learning_rate": 3.157790825761818e-07, - "loss": 0.3311, - "num_tokens": 1125693235.0, - "reward": 1.8330078125, - "reward_std": 0.5031325817108154, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.9287109375, - "rewards/tag_count_reward/std": 0.18649590015411377, + "grad_norm": 1.7121455669403076, + "kl": 1.69140625, + "learning_rate": 3.1592827326524395e-07, + "loss": 0.0695, + "num_tokens": 1163668221.0, + "reward": 1.07373046875, + "reward_std": 0.33283674716949463, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.17930388450622559, "step": 2071 }, { @@ -60074,27 +60074,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1906.0, - "completions/mean_length": 867.619140625, - "completions/mean_terminated_length": 834.4357299804688, - "completions/min_length": 18.0, - "completions/min_terminated_length": 18.0, + "completions/max_terminated_length": 1943.0, + "completions/mean_length": 942.34765625, + "completions/mean_terminated_length": 929.2371826171875, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, "epoch": 0.707348297345737, - "grad_norm": 1.3727738857269287, - "kl": 5.53125, - "learning_rate": 3.1532148389777766e-07, - "loss": 0.3497, - "num_tokens": 1126212560.0, - "reward": 1.8515625, - "reward_std": 0.49354204535484314, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, + "grad_norm": 1.2957576513290405, + "kl": 1.890625, + "learning_rate": 3.154703926105907e-07, + "loss": 0.1004, + "num_tokens": 1164225807.0, + "reward": 1.037109375, + "reward_std": 0.3216906487941742, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.1783159226179123, + "rewards/tag_count_reward/std": 0.17131954431533813, "step": 2072 }, { @@ -60103,27 +60103,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1906.0, - "completions/mean_length": 800.267578125, - "completions/mean_terminated_length": 744.2468872070312, - "completions/min_length": 7.0, - "completions/min_terminated_length": 7.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 907.765625, + "completions/mean_terminated_length": 885.0518188476562, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.7076896816591277, - "grad_norm": 0.9355970025062561, - "kl": 5.55859375, - "learning_rate": 3.148642183026037e-07, - "loss": 0.3709, - "num_tokens": 1126692553.0, - "reward": 1.90087890625, - "reward_std": 0.4898828864097595, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.93798828125, - "rewards/tag_count_reward/std": 0.17746655642986298, + "grad_norm": 6.42418098449707, + "kl": 2.2880859375, + "learning_rate": 3.1501284508042536e-07, + "loss": 0.0777, + "num_tokens": 1164760839.0, + "reward": 1.10498046875, + "reward_std": 0.3907593786716461, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.93310546875, + "rewards/tag_count_reward/std": 0.1749831587076187, "step": 2073 }, { @@ -60132,27 +60132,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 878.673828125, - "completions/mean_terminated_length": 813.5773315429688, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1012.328125, + "completions/mean_terminated_length": 991.697265625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.7080310659725185, - "grad_norm": 0.9593584537506104, - "kl": 7.15625, - "learning_rate": 3.14407286439665e-07, - "loss": 0.4826, - "num_tokens": 1127221202.0, - "reward": 1.880859375, - "reward_std": 0.5013091564178467, - "rewards/accuracy_reward/mean": 0.07459677755832672, - "rewards/accuracy_reward/std": 0.263004869222641, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.931640625, - "rewards/tag_count_reward/std": 0.19400213658809662, + "grad_norm": 1.9056379795074463, + "kl": 2.9296875, + "learning_rate": 3.1455563132464567e-07, + "loss": 0.1495, + "num_tokens": 1165357919.0, + "reward": 1.02490234375, + "reward_std": 0.3452129364013672, + "rewards/accuracy_reward/mean": 0.08669354766607285, + "rewards/accuracy_reward/std": 0.281669557094574, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.90185546875, + "rewards/tag_count_reward/std": 0.2081073820590973, "step": 2074 }, { @@ -60161,27 +60161,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1947.0, - "completions/mean_length": 815.337890625, - "completions/mean_terminated_length": 757.3599243164062, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 906.298828125, + "completions/mean_terminated_length": 878.8980712890625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.7083724502859093, - "grad_norm": 3.248323917388916, - "kl": 8.171875, - "learning_rate": 3.1395068895749275e-07, - "loss": 0.4901, - "num_tokens": 1127716383.0, - "reward": 1.89990234375, - "reward_std": 0.6105526089668274, - "rewards/accuracy_reward/mean": 0.138671875, - "rewards/accuracy_reward/std": 0.34594178199768066, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.20941190421581268, + "grad_norm": 2.7555909156799316, + "kl": 2.27734375, + "learning_rate": 3.1409875199267556e-07, + "loss": 0.1402, + "num_tokens": 1165899672.0, + "reward": 1.18798828125, + "reward_std": 0.4005546569824219, + "rewards/accuracy_reward/mean": 0.205078125, + "rewards/accuracy_reward/std": 0.4041535556316376, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.18214841187000275, "step": 2075 }, { @@ -60190,27 +60190,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 888.046875, - "completions/mean_terminated_length": 825.9917602539062, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/max_terminated_length": 1960.0, + "completions/mean_length": 1001.12109375, + "completions/mean_terminated_length": 980.2669677734375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.7087138345993002, - "grad_norm": 2.321079969406128, - "kl": 8.3671875, - "learning_rate": 3.134944265041436e-07, - "loss": 0.5019, - "num_tokens": 1128253431.0, - "reward": 1.79833984375, - "reward_std": 0.5561305284500122, - "rewards/accuracy_reward/mean": 0.0463709682226181, - "rewards/accuracy_reward/std": 0.21049949526786804, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.20428889989852905, + "grad_norm": 2.6930439472198486, + "kl": 2.21484375, + "learning_rate": 3.1364220773346346e-07, + "loss": 0.1197, + "num_tokens": 1166494614.0, + "reward": 1.0400390625, + "reward_std": 0.3177770972251892, + "rewards/accuracy_reward/mean": 0.060483869165182114, + "rewards/accuracy_reward/std": 0.2386218160390854, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.18428993225097656, "step": 2076 }, { @@ -60219,27 +60219,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 880.76953125, - "completions/mean_terminated_length": 838.2388916015625, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 953.2890625, + "completions/mean_terminated_length": 938.1148681640625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.709055218912691, - "grad_norm": 2.276273012161255, - "kl": 6.71875, - "learning_rate": 3.1303849972719834e-07, - "loss": 0.4276, - "num_tokens": 1128781393.0, - "reward": 1.86083984375, - "reward_std": 0.5237646102905273, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.1869123876094818, + "grad_norm": 4.2263875007629395, + "kl": 1.716796875, + "learning_rate": 3.1318599919548235e-07, + "loss": 0.091, + "num_tokens": 1167059706.0, + "reward": 1.0732421875, + "reward_std": 0.29490160942077637, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.16953378915786743, "step": 2077 }, { @@ -60248,27 +60248,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1934.0, - "completions/mean_length": 776.658203125, - "completions/mean_terminated_length": 740.9176635742188, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 866.671875, + "completions/mean_terminated_length": 852.6640625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, "epoch": 0.7093966032260818, - "grad_norm": 1.743714690208435, - "kl": 6.734375, - "learning_rate": 3.1258290927376187e-07, - "loss": 0.4157, - "num_tokens": 1129250066.0, - "reward": 1.88232421875, - "reward_std": 0.5077085494995117, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93505859375, - "rewards/tag_count_reward/std": 0.18912817537784576, + "grad_norm": 3.0302419662475586, + "kl": 2.55078125, + "learning_rate": 3.127301270267282e-07, + "loss": 0.1397, + "num_tokens": 1167574466.0, + "reward": 1.0859375, + "reward_std": 0.3346711993217468, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.17828376591205597, "step": 2078 }, { @@ -60277,27 +60277,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1846.0, - "completions/mean_length": 836.994140625, - "completions/mean_terminated_length": 787.7662353515625, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 891.16796875, + "completions/mean_terminated_length": 879.7593994140625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, "epoch": 0.7097379875394726, - "grad_norm": 1.4613759517669678, - "kl": 8.015625, - "learning_rate": 3.121276557904616e-07, - "loss": 0.4949, - "num_tokens": 1129758271.0, - "reward": 1.8251953125, - "reward_std": 0.5580568313598633, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.20534151792526245, + "grad_norm": 2.0054116249084473, + "kl": 2.01953125, + "learning_rate": 3.122745918747193e-07, + "loss": 0.083, + "num_tokens": 1168110408.0, + "reward": 1.08203125, + "reward_std": 0.3111875653266907, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.17896848917007446, "step": 2079 }, { @@ -60306,27 +60306,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.064453125, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 941.126953125, - "completions/mean_terminated_length": 864.87060546875, - "completions/min_length": 228.0, - "completions/min_terminated_length": 228.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1024.44140625, + "completions/mean_terminated_length": 993.5492553710938, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, "epoch": 0.7100793718528634, - "grad_norm": 1.0365214347839355, - "kl": 7.7265625, - "learning_rate": 3.1167273992344646e-07, - "loss": 0.5085, - "num_tokens": 1130317376.0, - "reward": 1.75830078125, - "reward_std": 0.5934568643569946, - "rewards/accuracy_reward/mean": 0.04435483738780022, - "rewards/accuracy_reward/std": 0.2060900777578354, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, - "rewards/tag_count_reward/mean": 0.89306640625, - "rewards/tag_count_reward/std": 0.23650622367858887, + "grad_norm": 2.6261651515960693, + "kl": 2.732421875, + "learning_rate": 3.1181939438649485e-07, + "loss": 0.1198, + "num_tokens": 1168712170.0, + "reward": 1.0224609375, + "reward_std": 0.33199214935302734, + "rewards/accuracy_reward/mean": 0.0786290317773819, + "rewards/accuracy_reward/std": 0.26943066716194153, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.9072265625, + "rewards/tag_count_reward/std": 0.20196326076984406, "step": 2080 }, { @@ -60335,27 +60335,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 823.771484375, - "completions/mean_terminated_length": 794.3900146484375, - "completions/min_length": 20.0, - "completions/min_terminated_length": 20.0, + "completions/max_terminated_length": 1845.0, + "completions/mean_length": 972.748046875, + "completions/mean_terminated_length": 962.1439819335938, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, "epoch": 0.7104207561662541, - "grad_norm": 0.7497561573982239, - "kl": 5.75, - "learning_rate": 3.112181623183866e-07, - "loss": 0.3274, - "num_tokens": 1130817611.0, - "reward": 1.87353515625, - "reward_std": 0.5437583923339844, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.19833672046661377, + "grad_norm": 1.6980721950531006, + "kl": 2.19921875, + "learning_rate": 3.1136453520861494e-07, + "loss": 0.081, + "num_tokens": 1169288681.0, + "reward": 1.08349609375, + "reward_std": 0.335152268409729, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.17210347950458527, "step": 2081 }, { @@ -60364,27 +60364,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 875.755859375, - "completions/mean_terminated_length": 828.1036376953125, - "completions/min_length": 120.0, - "completions/min_terminated_length": 120.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 961.078125, + "completions/mean_terminated_length": 943.825439453125, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, "epoch": 0.7107621404796449, - "grad_norm": 2.05602765083313, - "kl": 5.46875, - "learning_rate": 3.1076392362047117e-07, - "loss": 0.3592, - "num_tokens": 1131342606.0, - "reward": 1.8037109375, - "reward_std": 0.4788172245025635, - "rewards/accuracy_reward/mean": 0.01953125, - "rewards/accuracy_reward/std": 0.1385180652141571, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.9287109375, - "rewards/tag_count_reward/std": 0.18845312297344208, + "grad_norm": 3.2408053874969482, + "kl": 2.30859375, + "learning_rate": 3.1091001498715874e-07, + "loss": 0.0639, + "num_tokens": 1169857361.0, + "reward": 1.01171875, + "reward_std": 0.3441643714904785, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.908203125, + "rewards/tag_count_reward/std": 0.2005888968706131, "step": 2082 }, { @@ -60393,27 +60393,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1943.0, - "completions/mean_length": 845.427734375, - "completions/mean_terminated_length": 786.2847900390625, - "completions/min_length": 50.0, - "completions/min_terminated_length": 50.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 928.5625, + "completions/mean_terminated_length": 913.0455932617188, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.7111035247930357, - "grad_norm": 0.9748495221138, - "kl": 6.421875, - "learning_rate": 3.1031002447440945e-07, - "loss": 0.4114, - "num_tokens": 1131848105.0, - "reward": 1.87890625, - "reward_std": 0.5175662040710449, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.18894177675247192, + "grad_norm": 2.4408140182495117, + "kl": 2.0078125, + "learning_rate": 3.104558343677242e-07, + "loss": 0.0759, + "num_tokens": 1170405425.0, + "reward": 1.05859375, + "reward_std": 0.29011622071266174, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.1816289722919464, "step": 2083 }, { @@ -60422,27 +60422,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1921.0, - "completions/mean_length": 829.20703125, - "completions/mean_terminated_length": 774.4856567382812, - "completions/min_length": 120.0, - "completions/min_terminated_length": 120.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 905.404296875, + "completions/mean_terminated_length": 896.407470703125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, "epoch": 0.7114449091064265, - "grad_norm": 1.3156884908676147, - "kl": 7.265625, - "learning_rate": 3.0985646552442794e-07, - "loss": 0.4885, - "num_tokens": 1132342147.0, - "reward": 1.7783203125, - "reward_std": 0.5539376139640808, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.9111328125, - "rewards/tag_count_reward/std": 0.2142503559589386, + "grad_norm": 3.700108766555786, + "kl": 1.740234375, + "learning_rate": 3.100019939954267e-07, + "loss": 0.0588, + "num_tokens": 1170938480.0, + "reward": 1.07763671875, + "reward_std": 0.35653746128082275, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.1736346036195755, "step": 2084 }, { @@ -60451,27 +60451,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.064453125, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 945.25, - "completions/mean_terminated_length": 869.2777099609375, - "completions/min_length": 231.0, - "completions/min_terminated_length": 231.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1014.556640625, + "completions/mean_terminated_length": 1000.231689453125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, "epoch": 0.7117862934198174, - "grad_norm": 2.0443739891052246, - "kl": 7.4609375, - "learning_rate": 3.0940324741427103e-07, - "loss": 0.5197, - "num_tokens": 1132902563.0, - "reward": 1.7919921875, - "reward_std": 0.5862941145896912, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.9091796875, - "rewards/tag_count_reward/std": 0.21908392012119293, + "grad_norm": 1.6824538707733154, + "kl": 1.857421875, + "learning_rate": 3.0954849451489884e-07, + "loss": 0.0792, + "num_tokens": 1171534381.0, + "reward": 1.08056640625, + "reward_std": 0.3470780849456787, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.1794423907995224, "step": 2085 }, { @@ -60480,27 +60480,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 928.03515625, - "completions/mean_terminated_length": 863.2437744140625, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 947.73828125, + "completions/mean_terminated_length": 934.6917114257812, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.7121276777332082, - "grad_norm": 1.4917434453964233, - "kl": 6.5859375, - "learning_rate": 3.089503707871983e-07, - "loss": 0.3996, - "num_tokens": 1133452213.0, - "reward": 1.78466796875, - "reward_std": 0.5361355543136597, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.20647093653678894, + "grad_norm": 2.645061492919922, + "kl": 1.3857421875, + "learning_rate": 3.090953365702882e-07, + "loss": 0.0476, + "num_tokens": 1172094119.0, + "reward": 1.02392578125, + "reward_std": 0.3295062482357025, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.2029850035905838, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.1874433010816574, "step": 2086 }, { @@ -60509,27 +60509,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 841.89453125, - "completions/mean_terminated_length": 790.3096313476562, - "completions/min_length": 49.0, - "completions/min_terminated_length": 49.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 990.681640625, + "completions/mean_terminated_length": 965.3060302734375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, "epoch": 0.712469062046599, - "grad_norm": 1.0421783924102783, - "kl": 6.37890625, - "learning_rate": 3.084978362859859e-07, - "loss": 0.3985, - "num_tokens": 1133965023.0, - "reward": 1.830078125, - "reward_std": 0.5412536263465881, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.20316380262374878, + "grad_norm": 1.8598291873931885, + "kl": 2.478515625, + "learning_rate": 3.08642520805258e-07, + "loss": 0.1302, + "num_tokens": 1172683108.0, + "reward": 1.07470703125, + "reward_std": 0.36600035429000854, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.91845703125, + "rewards/tag_count_reward/std": 0.19551268219947815, "step": 2087 }, { @@ -60538,27 +60538,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1933.0, - "completions/mean_length": 919.078125, - "completions/mean_terminated_length": 843.8167114257812, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1041.626953125, + "completions/mean_terminated_length": 1011.2534790039062, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, "epoch": 0.7128104463599898, - "grad_norm": 1.837857723236084, - "kl": 8.21875, - "learning_rate": 3.080456445529237e-07, - "loss": 0.5007, - "num_tokens": 1134518903.0, - "reward": 1.783203125, - "reward_std": 0.6176773309707642, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.90234375, - "rewards/tag_count_reward/std": 0.21948698163032532, + "grad_norm": 2.8653650283813477, + "kl": 1.85546875, + "learning_rate": 3.081900478629848e-07, + "loss": 0.0801, + "num_tokens": 1173299733.0, + "reward": 1.04296875, + "reward_std": 0.3478839099407196, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.1977970004081726, "step": 2088 }, { @@ -60567,27 +60567,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 805.67578125, - "completions/mean_terminated_length": 752.5418090820312, - "completions/min_length": 18.0, - "completions/min_terminated_length": 18.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 923.197265625, + "completions/mean_terminated_length": 891.5762939453125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.7131518306733805, - "grad_norm": 1.8352817296981812, - "kl": 6.84375, - "learning_rate": 3.075937962298147e-07, - "loss": 0.3996, - "num_tokens": 1135008753.0, - "reward": 1.83154296875, - "reward_std": 0.5388531684875488, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.2074405401945114, + "grad_norm": 10.932439804077148, + "kl": 2.1171875, + "learning_rate": 3.077379183861587e-07, + "loss": 0.1544, + "num_tokens": 1173849754.0, + "reward": 1.0625, + "reward_std": 0.3302322328090668, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.19534705579280853, "step": 2089 }, { @@ -60596,27 +60596,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1961.0, - "completions/mean_length": 829.541015625, - "completions/mean_terminated_length": 802.7884521484375, - "completions/min_length": 196.0, - "completions/min_terminated_length": 196.0, + "completions/max_terminated_length": 1915.0, + "completions/mean_length": 932.109375, + "completions/mean_terminated_length": 916.6416015625, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.7134932149867713, - "grad_norm": 1.3277990818023682, - "kl": 5.046875, - "learning_rate": 3.0714229195797545e-07, - "loss": 0.3146, - "num_tokens": 1135506326.0, - "reward": 1.9111328125, - "reward_std": 0.4676492214202881, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.9443359375, - "rewards/tag_count_reward/std": 0.16648533940315247, + "grad_norm": 2.4517128467559814, + "kl": 1.623046875, + "learning_rate": 3.072861330169818e-07, + "loss": 0.0598, + "num_tokens": 1174399842.0, + "reward": 1.0400390625, + "reward_std": 0.26263684034347534, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.9423828125, + "rewards/tag_count_reward/std": 0.1658182144165039, "step": 2090 }, { @@ -60625,27 +60625,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 790.603515625, - "completions/mean_terminated_length": 744.7874755859375, - "completions/min_length": 120.0, - "completions/min_terminated_length": 120.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 882.740234375, + "completions/mean_terminated_length": 859.5278930664062, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, "epoch": 0.7138345993001621, - "grad_norm": 1.3557393550872803, - "kl": 6.1640625, - "learning_rate": 3.066911323782333e-07, - "loss": 0.4, - "num_tokens": 1135986123.0, - "reward": 1.9697265625, - "reward_std": 0.562901496887207, - "rewards/accuracy_reward/mean": 0.16129031777381897, - "rewards/accuracy_reward/std": 0.3681698739528656, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.18466287851333618, + "grad_norm": 3.553374767303467, + "kl": 2.40234375, + "learning_rate": 3.0683469239716753e-07, + "loss": 0.159, + "num_tokens": 1174926813.0, + "reward": 1.16015625, + "reward_std": 0.3651435971260071, + "rewards/accuracy_reward/mean": 0.19758065044879913, + "rewards/accuracy_reward/std": 0.398576021194458, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.17621363699436188, "step": 2091 }, { @@ -60654,27 +60654,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1888.0, - "completions/mean_length": 818.677734375, - "completions/mean_terminated_length": 776.4586181640625, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 970.98828125, + "completions/mean_terminated_length": 940.7108154296875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, "epoch": 0.714175983613553, - "grad_norm": 2.0661044120788574, - "kl": 5.5703125, - "learning_rate": 3.062403181309271e-07, - "loss": 0.3337, - "num_tokens": 1136485350.0, - "reward": 1.84130859375, - "reward_std": 0.5803340673446655, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, + "grad_norm": 7.568809986114502, + "kl": 2.634765625, + "learning_rate": 3.0638359716793926e-07, + "loss": 0.1993, + "num_tokens": 1175504023.0, + "reward": 1.06591796875, + "reward_std": 0.30761605501174927, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.18809975683689117, + "rewards/tag_count_reward/std": 0.18874886631965637, "step": 2092 }, { @@ -60683,27 +60683,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1999.0, - "completions/mean_length": 806.595703125, - "completions/mean_terminated_length": 763.961669921875, - "completions/min_length": 120.0, - "completions/min_terminated_length": 120.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 935.556640625, + "completions/mean_terminated_length": 908.8580322265625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, "epoch": 0.7145173679269438, - "grad_norm": 0.8907326459884644, - "kl": 5.4296875, - "learning_rate": 3.057898498559049e-07, - "loss": 0.3671, - "num_tokens": 1136971111.0, - "reward": 1.89013671875, - "reward_std": 0.4978489279747009, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93505859375, - "rewards/tag_count_reward/std": 0.18254666030406952, + "grad_norm": 3.367060661315918, + "kl": 2.111328125, + "learning_rate": 3.059328479700303e-07, + "loss": 0.1102, + "num_tokens": 1176055812.0, + "reward": 1.09228515625, + "reward_std": 0.31176435947418213, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.93603515625, + "rewards/tag_count_reward/std": 0.177461177110672, "step": 2093 }, { @@ -60712,27 +60712,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 818.623046875, - "completions/mean_terminated_length": 763.426513671875, - "completions/min_length": 102.0, - "completions/min_terminated_length": 102.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 936.71875, + "completions/mean_terminated_length": 921.3148803710938, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, "epoch": 0.7148587522403346, - "grad_norm": 1.2013685703277588, - "kl": 6.78125, - "learning_rate": 3.053397281925244e-07, - "loss": 0.4381, - "num_tokens": 1137464518.0, - "reward": 1.86572265625, - "reward_std": 0.5542969107627869, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.19833672046661377, + "grad_norm": 3.9183101654052734, + "kl": 2.189453125, + "learning_rate": 3.054824454436818e-07, + "loss": 0.1391, + "num_tokens": 1176609684.0, + "reward": 1.08251953125, + "reward_std": 0.3413243591785431, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.92041015625, + "rewards/tag_count_reward/std": 0.19126836955547333, "step": 2094 }, { @@ -60741,27 +60741,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 842.0859375, - "completions/mean_terminated_length": 780.1807250976562, - "completions/min_length": 52.0, - "completions/min_terminated_length": 52.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 916.521484375, + "completions/mean_terminated_length": 905.3629150390625, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.7152001365537254, - "grad_norm": 1.0693302154541016, - "kl": 5.84375, - "learning_rate": 3.0488995377965064e-07, - "loss": 0.373, - "num_tokens": 1137967250.0, - "reward": 1.84423828125, - "reward_std": 0.5321292281150818, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.1979798823595047, + "grad_norm": 1.8598436117172241, + "kl": 1.609375, + "learning_rate": 3.0503239022864327e-07, + "loss": 0.069, + "num_tokens": 1177150527.0, + "reward": 1.0693359375, + "reward_std": 0.3265714943408966, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.9423828125, + "rewards/tag_count_reward/std": 0.1628410518169403, "step": 2095 }, { @@ -60770,27 +60770,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 858.94921875, - "completions/mean_terminated_length": 815.6234741210938, - "completions/min_length": 54.0, - "completions/min_terminated_length": 54.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 971.244140625, + "completions/mean_terminated_length": 947.602783203125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, "epoch": 0.7155415208671162, - "grad_norm": 1.3301665782928467, - "kl": 5.2265625, - "learning_rate": 3.0444052725565614e-07, - "loss": 0.3372, - "num_tokens": 1138483112.0, - "reward": 1.85498046875, - "reward_std": 0.546319842338562, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, + "grad_norm": 1.6463770866394043, + "kl": 2.41015625, + "learning_rate": 3.045826829641701e-07, + "loss": 0.1041, + "num_tokens": 1177723884.0, + "reward": 1.10693359375, + "reward_std": 0.3826766014099121, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.19079817831516266, + "rewards/tag_count_reward/std": 0.1888652741909027, "step": 2096 }, { @@ -60799,27 +60799,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1911.0, - "completions/mean_length": 789.494140625, - "completions/mean_terminated_length": 743.6376953125, - "completions/min_length": 179.0, - "completions/min_terminated_length": 179.0, + "completions/max_terminated_length": 1879.0, + "completions/mean_length": 838.83203125, + "completions/mean_terminated_length": 834.0902709960938, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.7158829051805069, - "grad_norm": 1.5741279125213623, - "kl": 6.3671875, - "learning_rate": 3.0399144925841993e-07, - "loss": 0.3954, - "num_tokens": 1138958373.0, - "reward": 1.91943359375, - "reward_std": 0.517371416091919, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.93505859375, - "rewards/tag_count_reward/std": 0.18783032894134521, + "grad_norm": 2.6229939460754395, + "kl": 1.767578125, + "learning_rate": 3.0413332428902437e-07, + "loss": 0.0647, + "num_tokens": 1178224406.0, + "reward": 1.15576171875, + "reward_std": 0.34136268496513367, + "rewards/accuracy_reward/mean": 0.162109375, + "rewards/accuracy_reward/std": 0.3689115643501282, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.95458984375, + "rewards/tag_count_reward/std": 0.14592784643173218, "step": 2097 }, { @@ -60828,27 +60828,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 748.955078125, - "completions/mean_terminated_length": 733.5513916015625, - "completions/min_length": 87.0, - "completions/min_terminated_length": 87.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 840.658203125, + "completions/mean_terminated_length": 821.4940795898438, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, "epoch": 0.7162242894938977, - "grad_norm": 1.4236983060836792, - "kl": 6.2109375, - "learning_rate": 3.0354272042532573e-07, - "loss": 0.369, - "num_tokens": 1139425758.0, - "reward": 1.81103515625, - "reward_std": 0.5365231037139893, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.19397197663784027, + "grad_norm": 2.4416191577911377, + "kl": 2.51171875, + "learning_rate": 3.036843148414722e-07, + "loss": 0.1533, + "num_tokens": 1178738743.0, + "reward": 1.05078125, + "reward_std": 0.2867480516433716, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.951171875, + "rewards/tag_count_reward/std": 0.15814046561717987, "step": 2098 }, { @@ -60857,27 +60857,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 768.205078125, - "completions/mean_terminated_length": 732.2268676757812, - "completions/min_length": 71.0, - "completions/min_terminated_length": 71.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 897.134765625, + "completions/mean_terminated_length": 869.5140380859375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, "epoch": 0.7165656738072885, - "grad_norm": 1.2227526903152466, - "kl": 6.6796875, - "learning_rate": 3.030943413932624e-07, - "loss": 0.4388, - "num_tokens": 1139892087.0, - "reward": 1.8447265625, - "reward_std": 0.5190199613571167, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.19347688555717468, + "grad_norm": 2.347712516784668, + "kl": 3.140625, + "learning_rate": 3.032356552592841e-07, + "loss": 0.1899, + "num_tokens": 1179271084.0, + "reward": 1.07177734375, + "reward_std": 0.3246845602989197, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.19546380639076233, "step": 2099 }, { @@ -60886,27 +60886,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 823.9765625, - "completions/mean_terminated_length": 771.6253051757812, - "completions/min_length": 104.0, - "completions/min_terminated_length": 104.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 953.275390625, + "completions/mean_terminated_length": 935.8988647460938, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.7169070581206793, - "grad_norm": 0.9703647494316101, - "kl": 7.328125, - "learning_rate": 3.0264631279862183e-07, - "loss": 0.4783, - "num_tokens": 1140382683.0, - "reward": 1.90625, - "reward_std": 0.586249589920044, - "rewards/accuracy_reward/mean": 0.13709677755832672, - "rewards/accuracy_reward/std": 0.34429675340652466, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.18882036209106445, + "grad_norm": 3.475773811340332, + "kl": 3.20703125, + "learning_rate": 3.027873461797334e-07, + "loss": 0.1539, + "num_tokens": 1179827881.0, + "reward": 1.10546875, + "reward_std": 0.382254421710968, + "rewards/accuracy_reward/mean": 0.1391129046678543, + "rewards/accuracy_reward/std": 0.34641367197036743, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.19541551172733307, "step": 2100 }, { @@ -60915,27 +60915,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 883.41015625, - "completions/mean_terminated_length": 831.1224365234375, - "completions/min_length": 51.0, - "completions/min_terminated_length": 51.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1021.12109375, + "completions/mean_terminated_length": 990.1287231445312, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.7172484424340702, - "grad_norm": 1.013115406036377, - "kl": 5.859375, - "learning_rate": 3.021986352772985e-07, - "loss": 0.3741, - "num_tokens": 1140916173.0, - "reward": 1.8623046875, - "reward_std": 0.5495156049728394, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.21240492165088654, + "grad_norm": 4.873970985412598, + "kl": 2.4296875, + "learning_rate": 3.023393882395959e-07, + "loss": 0.1051, + "num_tokens": 1180431879.0, + "reward": 1.048828125, + "reward_std": 0.3608614206314087, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.19246920943260193, "step": 2101 }, { @@ -60944,27 +60944,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 744.212890625, - "completions/mean_terminated_length": 715.5868530273438, - "completions/min_length": 102.0, - "completions/min_terminated_length": 102.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 912.314453125, + "completions/mean_terminated_length": 882.7274780273438, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.717589826747461, - "grad_norm": 1.1057240962982178, - "kl": 4.38671875, - "learning_rate": 3.0175130946468894e-07, - "loss": 0.2734, - "num_tokens": 1141380762.0, - "reward": 1.9521484375, - "reward_std": 0.4787192642688751, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.90625, - "rewards/format_reward/std": 0.29176566004753113, - "rewards/tag_count_reward/mean": 0.9482421875, - "rewards/tag_count_reward/std": 0.1655413806438446, + "grad_norm": 3.9878900051116943, + "kl": 3.9453125, + "learning_rate": 3.018917820751481e-07, + "loss": 0.2186, + "num_tokens": 1180982536.0, + "reward": 1.09765625, + "reward_std": 0.3582857847213745, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.18201786279678345, "step": 2102 }, { @@ -60973,27 +60973,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1883.0, - "completions/mean_length": 824.912109375, - "completions/mean_terminated_length": 780.34619140625, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 931.134765625, + "completions/mean_terminated_length": 915.6535034179688, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, "epoch": 0.7179312110608518, - "grad_norm": 0.7997674942016602, - "kl": 5.7421875, - "learning_rate": 3.013043359956903e-07, - "loss": 0.3634, - "num_tokens": 1141880909.0, - "reward": 1.9013671875, - "reward_std": 0.5231031179428101, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.9345703125, - "rewards/tag_count_reward/std": 0.18733429908752441, + "grad_norm": 2.69391131401062, + "kl": 2.623046875, + "learning_rate": 3.0144452832216776e-07, + "loss": 0.1179, + "num_tokens": 1181537069.0, + "reward": 1.11865234375, + "reward_std": 0.3504186272621155, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.17149166762828827, "step": 2103 }, { @@ -61002,27 +61002,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1989.0, - "completions/mean_length": 857.919921875, - "completions/mean_terminated_length": 812.0547485351562, - "completions/min_length": 114.0, - "completions/min_terminated_length": 114.0, + "completions/max_terminated_length": 1902.0, + "completions/mean_length": 958.326171875, + "completions/mean_terminated_length": 945.4051513671875, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, "epoch": 0.7182725953742426, - "grad_norm": 0.9730769991874695, - "kl": 7.140625, - "learning_rate": 3.008577155046997e-07, - "loss": 0.458, - "num_tokens": 1142396996.0, - "reward": 1.845703125, - "reward_std": 0.544554591178894, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.19783563911914825, + "grad_norm": 1.4373724460601807, + "kl": 2.30078125, + "learning_rate": 3.00997627615931e-07, + "loss": 0.1204, + "num_tokens": 1182104564.0, + "reward": 1.0830078125, + "reward_std": 0.3346357047557831, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.17451083660125732, "step": 2104 }, { @@ -61031,27 +61031,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1996.0, - "completions/mean_length": 805.833984375, - "completions/mean_terminated_length": 750.063232421875, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 935.453125, + "completions/mean_terminated_length": 917.793701171875, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, "epoch": 0.7186139796876334, - "grad_norm": 0.9937102198600769, - "kl": 6.703125, - "learning_rate": 3.0041144862561307e-07, - "loss": 0.4164, - "num_tokens": 1142889791.0, - "reward": 1.8515625, - "reward_std": 0.5155594348907471, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.19810594618320465, + "grad_norm": 7.588167190551758, + "kl": 2.53125, + "learning_rate": 3.005510805912133e-07, + "loss": 0.0835, + "num_tokens": 1182663724.0, + "reward": 1.0927734375, + "reward_std": 0.3213024437427521, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.16808471083641052, "step": 2105 }, { @@ -61060,27 +61060,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1930.0, - "completions/mean_length": 829.451171875, - "completions/mean_terminated_length": 795.1947631835938, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 971.388671875, + "completions/mean_terminated_length": 952.1251831054688, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, "epoch": 0.7189553640010241, - "grad_norm": 0.8206349611282349, - "kl": 6.078125, - "learning_rate": 2.9996553599182487e-07, - "loss": 0.3906, - "num_tokens": 1143390134.0, - "reward": 1.86279296875, - "reward_std": 0.44854938983917236, - "rewards/accuracy_reward/mean": 0.04233871027827263, - "rewards/accuracy_reward/std": 0.2015640139579773, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.93701171875, - "rewards/tag_count_reward/std": 0.1871933490037918, + "grad_norm": 3.119499921798706, + "kl": 2.05078125, + "learning_rate": 3.001048878822872e-07, + "loss": 0.0841, + "num_tokens": 1183236739.0, + "reward": 1.0673828125, + "reward_std": 0.3247576355934143, + "rewards/accuracy_reward/mean": 0.0786290317773819, + "rewards/accuracy_reward/std": 0.26943066716194153, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.9404296875, + "rewards/tag_count_reward/std": 0.16733261942863464, "step": 2106 }, { @@ -61089,27 +61089,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 814.73046875, - "completions/mean_terminated_length": 772.3757934570312, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 968.78125, + "completions/mean_terminated_length": 940.6653442382812, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, "epoch": 0.7192967483144149, - "grad_norm": 1.5673121213912964, - "kl": 5.7890625, - "learning_rate": 2.995199782362264e-07, - "loss": 0.372, - "num_tokens": 1143887468.0, - "reward": 1.87353515625, - "reward_std": 0.5153354406356812, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.17883430421352386, + "grad_norm": 1.7402732372283936, + "kl": 2.09375, + "learning_rate": 2.996590501229224e-07, + "loss": 0.1197, + "num_tokens": 1183812947.0, + "reward": 1.099609375, + "reward_std": 0.3466918468475342, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.935546875, + "rewards/tag_count_reward/std": 0.17203198373317719, "step": 2107 }, { @@ -61118,27 +61118,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1857.0, - "completions/mean_length": 869.26953125, - "completions/mean_terminated_length": 818.8554077148438, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1015.87890625, + "completions/mean_terminated_length": 997.4114990234375, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, "epoch": 0.7196381326278057, - "grad_norm": 1.3445322513580322, - "kl": 8.1953125, - "learning_rate": 2.9907477599120537e-07, - "loss": 0.5169, - "num_tokens": 1144407734.0, - "reward": 1.8125, - "reward_std": 0.5680813789367676, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.20518557727336884, + "grad_norm": 3.0502305030822754, + "kl": 1.564453125, + "learning_rate": 2.99213567946384e-07, + "loss": 0.1003, + "num_tokens": 1184408277.0, + "reward": 1.07373046875, + "reward_std": 0.3070228695869446, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.92919921875, + "rewards/tag_count_reward/std": 0.17967121303081512, "step": 2108 }, { @@ -61147,27 +61147,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1883.0, - "completions/mean_length": 797.939453125, - "completions/mean_terminated_length": 752.3906860351562, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 920.609375, + "completions/mean_terminated_length": 909.4911499023438, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, "epoch": 0.7199795169411966, - "grad_norm": 1.546714425086975, - "kl": 5.421875, - "learning_rate": 2.98629929888645e-07, - "loss": 0.3528, - "num_tokens": 1144889079.0, - "reward": 1.8740234375, - "reward_std": 0.5286086797714233, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.9306640625, - "rewards/tag_count_reward/std": 0.189181849360466, + "grad_norm": 2.784146785736084, + "kl": 1.6171875, + "learning_rate": 2.9876844198543266e-07, + "loss": 0.0961, + "num_tokens": 1184952429.0, + "reward": 1.09423828125, + "reward_std": 0.29471534490585327, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.94775390625, + "rewards/tag_count_reward/std": 0.16427458822727203, "step": 2109 }, { @@ -61176,27 +61176,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1986.0, - "completions/mean_length": 816.01171875, - "completions/mean_terminated_length": 752.7680053710938, - "completions/min_length": 167.0, - "completions/min_terminated_length": 167.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 980.408203125, + "completions/mean_terminated_length": 952.59521484375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, "epoch": 0.7203209012545874, - "grad_norm": 1.4555652141571045, - "kl": 8.5703125, - "learning_rate": 2.981854405599228e-07, - "loss": 0.5657, - "num_tokens": 1145389805.0, - "reward": 1.7900390625, - "reward_std": 0.5721191167831421, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.20736047625541687, + "grad_norm": 1.8828357458114624, + "kl": 2.0859375, + "learning_rate": 2.983236728723224e-07, + "loss": 0.1392, + "num_tokens": 1185537326.0, + "reward": 1.0439453125, + "reward_std": 0.3143582344055176, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.18964596092700958, "step": 2110 }, { @@ -61205,27 +61205,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1891.0, - "completions/mean_length": 737.791015625, - "completions/mean_terminated_length": 684.5304565429688, - "completions/min_length": 29.0, - "completions/min_terminated_length": 29.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 898.091796875, + "completions/mean_terminated_length": 875.185302734375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, "epoch": 0.7206622855679782, - "grad_norm": 16.444395065307617, - "kl": 6.9140625, - "learning_rate": 2.9774130863591035e-07, - "loss": 0.4781, - "num_tokens": 1145849234.0, - "reward": 1.81494140625, - "reward_std": 0.4680173397064209, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.18794220685958862, + "grad_norm": 1.9902719259262085, + "kl": 1.7890625, + "learning_rate": 2.9787926123880097e-07, + "loss": 0.1408, + "num_tokens": 1186078829.0, + "reward": 1.05126953125, + "reward_std": 0.27194079756736755, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.94189453125, + "rewards/tag_count_reward/std": 0.16304263472557068, "step": 2111 }, { @@ -61234,27 +61234,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 804.78125, - "completions/mean_terminated_length": 754.243896484375, - "completions/min_length": 27.0, - "completions/min_terminated_length": 27.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 921.8671875, + "completions/mean_terminated_length": 910.7613525390625, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, "epoch": 0.721003669881369, - "grad_norm": 1.175093650817871, - "kl": 7.2265625, - "learning_rate": 2.9729753474697157e-07, - "loss": 0.4748, - "num_tokens": 1146337218.0, - "reward": 1.87353515625, - "reward_std": 0.5310930013656616, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.19259266555309296, + "grad_norm": 5.46536111831665, + "kl": 1.5556640625, + "learning_rate": 2.974352077161077e-07, + "loss": 0.1022, + "num_tokens": 1186626761.0, + "reward": 1.10546875, + "reward_std": 0.30975374579429626, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.1631019562482834, "step": 2112 }, { @@ -61263,27 +61263,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 774.974609375, - "completions/mean_terminated_length": 733.9092407226562, - "completions/min_length": 2.0, - "completions/min_terminated_length": 2.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 935.671875, + "completions/mean_terminated_length": 915.7693481445312, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, "epoch": 0.7213450541947598, - "grad_norm": 1.9142338037490845, - "kl": 5.5703125, - "learning_rate": 2.9685411952296214e-07, - "loss": 0.3121, - "num_tokens": 1146808229.0, - "reward": 1.90380859375, - "reward_std": 0.543915867805481, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.93505859375, - "rewards/tag_count_reward/std": 0.19233450293540955, + "grad_norm": 4.400142669677734, + "kl": 2.126953125, + "learning_rate": 2.96991512934974e-07, + "loss": 0.0673, + "num_tokens": 1187180049.0, + "reward": 1.14013671875, + "reward_std": 0.38579443097114563, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.17150279879570007, "step": 2113 }, { @@ -61292,27 +61292,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1875.0, - "completions/mean_length": 832.79296875, - "completions/mean_terminated_length": 770.4107055664062, - "completions/min_length": 204.0, - "completions/min_terminated_length": 204.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 952.654296875, + "completions/mean_terminated_length": 935.2678833007812, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, "epoch": 0.7216864385081505, - "grad_norm": 1.2744873762130737, - "kl": 7.84375, - "learning_rate": 2.964110635932292e-07, - "loss": 0.5237, - "num_tokens": 1147305019.0, - "reward": 1.8408203125, - "reward_std": 0.5451600551605225, - "rewards/accuracy_reward/mean": 0.05645161122083664, - "rewards/accuracy_reward/std": 0.23102474212646484, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.20382823050022125, + "grad_norm": 1.9835413694381714, + "kl": 1.857421875, + "learning_rate": 2.965481775256211e-07, + "loss": 0.1005, + "num_tokens": 1187738208.0, + "reward": 1.04638671875, + "reward_std": 0.29439234733581543, + "rewards/accuracy_reward/mean": 0.07459677755832672, + "rewards/accuracy_reward/std": 0.263004869222641, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.17079374194145203, "step": 2114 }, { @@ -61321,27 +61321,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1978.0, - "completions/mean_length": 821.94140625, - "completions/mean_terminated_length": 753.6866455078125, - "completions/min_length": 38.0, - "completions/min_terminated_length": 38.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 972.103515625, + "completions/mean_terminated_length": 937.3971557617188, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, "epoch": 0.7220278228215413, - "grad_norm": 3.1961309909820557, - "kl": 8.390625, - "learning_rate": 2.959683675866093e-07, - "loss": 0.5083, - "num_tokens": 1147805693.0, - "reward": 1.83544921875, - "reward_std": 0.5922591686248779, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.91162109375, - "rewards/tag_count_reward/std": 0.21071286499500275, + "grad_norm": 2.0127451419830322, + "kl": 1.984375, + "learning_rate": 2.9610520211776e-07, + "loss": 0.0994, + "num_tokens": 1188315765.0, + "reward": 1.1064453125, + "reward_std": 0.3446625769138336, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.18495242297649384, "step": 2115 }, { @@ -61350,27 +61350,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1888.0, - "completions/mean_length": 759.109375, - "completions/mean_terminated_length": 730.8103637695312, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 946.49609375, + "completions/mean_terminated_length": 920.06005859375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.7223692071349321, - "grad_norm": 1.4142956733703613, - "kl": 5.3984375, - "learning_rate": 2.955260321314287e-07, - "loss": 0.3445, - "num_tokens": 1148266917.0, - "reward": 1.88916015625, - "reward_std": 0.47626230120658875, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.89453125, - "rewards/format_reward/std": 0.3074568510055542, - "rewards/tag_count_reward/mean": 0.94384765625, - "rewards/tag_count_reward/std": 0.1745731234550476, + "grad_norm": 1.6072068214416504, + "kl": 2.466796875, + "learning_rate": 2.956625873405905e-07, + "loss": 0.1435, + "num_tokens": 1188872931.0, + "reward": 1.06494140625, + "reward_std": 0.33009999990463257, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.18012270331382751, "step": 2116 }, { @@ -61379,27 +61379,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1935.0, - "completions/mean_length": 713.673828125, - "completions/mean_terminated_length": 676.16259765625, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 1939.0, + "completions/mean_length": 825.26171875, + "completions/mean_terminated_length": 813.2031860351562, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, "epoch": 0.722710591448323, - "grad_norm": 1.2257810831069946, - "kl": 6.125, - "learning_rate": 2.9508405785550144e-07, - "loss": 0.4101, - "num_tokens": 1148715262.0, - "reward": 1.89404296875, - "reward_std": 0.49057528376579285, - "rewards/accuracy_reward/mean": 0.07258064299821854, - "rewards/accuracy_reward/std": 0.25970885157585144, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.17711623013019562, + "grad_norm": 3.4244606494903564, + "kl": 1.953125, + "learning_rate": 2.952203338228002e-07, + "loss": 0.1117, + "num_tokens": 1189378409.0, + "reward": 1.09423828125, + "reward_std": 0.32503634691238403, + "rewards/accuracy_reward/mean": 0.13508065044879913, + "rewards/accuracy_reward/std": 0.3421548008918762, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, + "rewards/tag_count_reward/mean": 0.93603515625, + "rewards/tag_count_reward/std": 0.16826297342777252, "step": 2117 }, { @@ -61408,27 +61408,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1966.0, - "completions/mean_length": 745.078125, - "completions/mean_terminated_length": 729.6284790039062, - "completions/min_length": 101.0, - "completions/min_terminated_length": 101.0, + "completions/max_terminated_length": 1866.0, + "completions/mean_length": 896.767578125, + "completions/mean_terminated_length": 873.834716796875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.7230519757617138, - "grad_norm": 1.1388176679611206, - "kl": 4.765625, - "learning_rate": 2.946424453861294e-07, - "loss": 0.2873, - "num_tokens": 1149176006.0, - "reward": 1.90673828125, - "reward_std": 0.5317778587341309, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.1781490594148636, + "grad_norm": 3.5652318000793457, + "kl": 2.318359375, + "learning_rate": 2.947784421925631e-07, + "loss": 0.1761, + "num_tokens": 1189916818.0, + "reward": 1.125, + "reward_std": 0.3374006748199463, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.16621248424053192, "step": 2118 }, { @@ -61437,27 +61437,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 824.03125, - "completions/mean_terminated_length": 784.54833984375, - "completions/min_length": 227.0, - "completions/min_terminated_length": 227.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 999.025390625, + "completions/mean_terminated_length": 967.3661499023438, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, "epoch": 0.7233933600751046, - "grad_norm": 0.9671112895011902, - "kl": 6.1171875, - "learning_rate": 2.942011953501007e-07, - "loss": 0.3821, - "num_tokens": 1149684438.0, - "reward": 1.796875, - "reward_std": 0.5368913412094116, - "rewards/accuracy_reward/mean": 0.04032257944345474, - "rewards/accuracy_reward/std": 0.19691328704357147, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.21601147949695587, + "grad_norm": 1.6633632183074951, + "kl": 2.984375, + "learning_rate": 2.943369130775399e-07, + "loss": 0.1836, + "num_tokens": 1190514847.0, + "reward": 1.0029296875, + "reward_std": 0.3062749207019806, + "rewards/accuracy_reward/mean": 0.058467742055654526, + "rewards/accuracy_reward/std": 0.23486268520355225, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.17416280508041382, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.20237918198108673, "step": 2119 }, { @@ -61466,27 +61466,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, - "completions/mean_length": 788.46484375, - "completions/mean_terminated_length": 755.6513061523438, - "completions/min_length": 53.0, - "completions/min_terminated_length": 53.0, + "completions/mean_length": 947.65625, + "completions/mean_terminated_length": 921.248046875, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.7237347443884954, - "grad_norm": 1.0110492706298828, - "kl": 6.1875, - "learning_rate": 2.937603083736887e-07, - "loss": 0.4005, - "num_tokens": 1150163460.0, - "reward": 1.8330078125, - "reward_std": 0.5243314504623413, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.19744645059108734, + "grad_norm": 1.7128705978393555, + "kl": 1.9453125, + "learning_rate": 2.9389574710487547e-07, + "loss": 0.1179, + "num_tokens": 1191075375.0, + "reward": 1.07177734375, + "reward_std": 0.3148093819618225, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.17007611691951752, "step": 2120 }, { @@ -61495,27 +61495,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1970.0, - "completions/mean_length": 792.935546875, - "completions/mean_terminated_length": 744.56591796875, - "completions/min_length": 83.0, - "completions/min_terminated_length": 83.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 939.98046875, + "completions/mean_terminated_length": 926.8419189453125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.7240761287018862, - "grad_norm": 1.0699777603149414, - "kl": 5.484375, - "learning_rate": 2.9331978508265225e-07, - "loss": 0.3312, - "num_tokens": 1150658115.0, - "reward": 1.8525390625, - "reward_std": 0.5569126009941101, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.18362505733966827, + "grad_norm": 3.762661933898926, + "kl": 2.34375, + "learning_rate": 2.934549449011997e-07, + "loss": 0.111, + "num_tokens": 1191645317.0, + "reward": 1.083984375, + "reward_std": 0.3455064296722412, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.17952290177345276, "step": 2121 }, { @@ -61524,27 +61524,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1816.0, - "completions/mean_length": 799.392578125, - "completions/mean_terminated_length": 764.2911376953125, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1003.40234375, + "completions/mean_terminated_length": 982.5936279296875, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, "epoch": 0.7244175130152769, - "grad_norm": 2.0671730041503906, - "kl": 5.00390625, - "learning_rate": 2.9287962610223326e-07, - "loss": 0.3268, - "num_tokens": 1151145644.0, - "reward": 1.94677734375, - "reward_std": 0.5002008676528931, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310423493385315, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.94091796875, - "rewards/tag_count_reward/std": 0.17777857184410095, + "grad_norm": 5.650506019592285, + "kl": 3.041015625, + "learning_rate": 2.930145070926254e-07, + "loss": 0.2066, + "num_tokens": 1192237299.0, + "reward": 1.134765625, + "reward_std": 0.36895841360092163, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.20257031917572021, "step": 2122 }, { @@ -61553,27 +61553,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1828.0, - "completions/mean_length": 750.310546875, - "completions/mean_terminated_length": 705.7434692382812, - "completions/min_length": 98.0, - "completions/min_terminated_length": 98.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 946.919921875, + "completions/mean_terminated_length": 922.7445068359375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.7247588973286677, - "grad_norm": 2.3090503215789795, - "kl": 5.078125, - "learning_rate": 2.924398320571573e-07, - "loss": 0.3454, - "num_tokens": 1151608779.0, - "reward": 1.8623046875, - "reward_std": 0.4841301143169403, - "rewards/accuracy_reward/mean": 0.04583333432674408, - "rewards/accuracy_reward/std": 0.20934167504310608, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.17308135330677032, + "grad_norm": 6.777873516082764, + "kl": 2.32421875, + "learning_rate": 2.92574434304748e-07, + "loss": 0.0759, + "num_tokens": 1192801098.0, + "reward": 1.07080078125, + "reward_std": 0.36891961097717285, + "rewards/accuracy_reward/mean": 0.0833333358168602, + "rewards/accuracy_reward/std": 0.2766737639904022, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.192215234041214, "step": 2123 }, { @@ -61582,27 +61582,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 794.26171875, - "completions/mean_terminated_length": 761.5991821289062, - "completions/min_length": 182.0, - "completions/min_terminated_length": 182.0, + "completions/max_terminated_length": 1851.0, + "completions/mean_length": 933.951171875, + "completions/mean_terminated_length": 914.017822265625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, "epoch": 0.7251002816420585, - "grad_norm": 1.55347740650177, - "kl": 7.2578125, - "learning_rate": 2.9200040357163114e-07, - "loss": 0.4675, - "num_tokens": 1152091841.0, - "reward": 1.81689453125, - "reward_std": 0.5591763257980347, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.19693946838378906, + "grad_norm": 3.0265586376190186, + "kl": 3.3984375, + "learning_rate": 2.921347271626442e-07, + "loss": 0.1887, + "num_tokens": 1193355681.0, + "reward": 1.087890625, + "reward_std": 0.3291592299938202, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.17828376591205597, "step": 2124 }, { @@ -61611,27 +61611,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 748.294921875, - "completions/mean_terminated_length": 709.068359375, - "completions/min_length": 85.0, - "completions/min_terminated_length": 85.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 903.271484375, + "completions/mean_terminated_length": 882.7892456054688, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.7254416659554493, - "grad_norm": 3.6218671798706055, - "kl": 7.74609375, - "learning_rate": 2.915613412693435e-07, - "loss": 0.4376, - "num_tokens": 1152552744.0, - "reward": 1.8427734375, - "reward_std": 0.5550713539123535, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.20478235185146332, + "grad_norm": 7.29146671295166, + "kl": 2.603515625, + "learning_rate": 2.9169538629087153e-07, + "loss": 0.0974, + "num_tokens": 1193895932.0, + "reward": 1.12109375, + "reward_std": 0.32780325412750244, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.951171875, + "rewards/tag_count_reward/std": 0.15422488749027252, "step": 2125 }, { @@ -61640,27 +61640,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1860.0, - "completions/mean_length": 750.392578125, - "completions/mean_terminated_length": 721.9022216796875, - "completions/min_length": 69.0, - "completions/min_terminated_length": 69.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 940.7890625, + "completions/mean_terminated_length": 929.8698120117188, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, "epoch": 0.7257830502688402, - "grad_norm": 2.598684310913086, - "kl": 6.4765625, - "learning_rate": 2.911226457734628e-07, - "loss": 0.349, - "num_tokens": 1153013601.0, - "reward": 1.85595703125, - "reward_std": 0.5394971370697021, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.1892392933368683, + "grad_norm": 4.0576066970825195, + "kl": 2.369140625, + "learning_rate": 2.912564123134671e-07, + "loss": 0.1003, + "num_tokens": 1194454272.0, + "reward": 1.08984375, + "reward_std": 0.31391602754592896, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.16913031041622162, "step": 2126 }, { @@ -61669,27 +61669,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1960.0, - "completions/mean_length": 809.75390625, - "completions/mean_terminated_length": 754.1591796875, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 972.427734375, + "completions/mean_terminated_length": 951.0020141601562, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, "epoch": 0.726124434582231, - "grad_norm": 3.110694646835327, - "kl": 8.6328125, - "learning_rate": 2.90684317706637e-07, - "loss": 0.4948, - "num_tokens": 1153516003.0, - "reward": 1.75146484375, - "reward_std": 0.604299008846283, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.794921875, - "rewards/format_reward/std": 0.4041535556316376, - "rewards/tag_count_reward/mean": 0.90185546875, - "rewards/tag_count_reward/std": 0.21333131194114685, + "grad_norm": 2.921215295791626, + "kl": 2.171875, + "learning_rate": 2.9081780585394694e-07, + "loss": 0.0994, + "num_tokens": 1195039963.0, + "reward": 1.08349609375, + "reward_std": 0.3452760875225067, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.16121558845043182, "step": 2127 }, { @@ -61698,27 +61698,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 767.173828125, - "completions/mean_terminated_length": 728.51708984375, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 940.39453125, + "completions/mean_terminated_length": 902.3555908203125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.7264658188956218, - "grad_norm": 1.4040441513061523, - "kl": 6.90625, - "learning_rate": 2.9024635769099287e-07, - "loss": 0.427, - "num_tokens": 1153991452.0, - "reward": 1.84033203125, - "reward_std": 0.5145344138145447, - "rewards/accuracy_reward/mean": 0.0463709682226181, - "rewards/accuracy_reward/std": 0.21049949526786804, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.18341873586177826, + "grad_norm": 2.3903448581695557, + "kl": 2.72265625, + "learning_rate": 2.9037956753530534e-07, + "loss": 0.1544, + "num_tokens": 1195604101.0, + "reward": 1.05712890625, + "reward_std": 0.3574679493904114, + "rewards/accuracy_reward/mean": 0.0947580635547638, + "rewards/accuracy_reward/std": 0.29317617416381836, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.91845703125, + "rewards/tag_count_reward/std": 0.19675986468791962, "step": 2128 }, { @@ -61727,27 +61727,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 761.314453125, - "completions/mean_terminated_length": 719.8084716796875, - "completions/min_length": 24.0, - "completions/min_terminated_length": 24.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 887.892578125, + "completions/mean_terminated_length": 874.1364135742188, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, "epoch": 0.7268072032090126, - "grad_norm": 2.685636043548584, - "kl": 7.5625, - "learning_rate": 2.8980876634813424e-07, - "loss": 0.4497, - "num_tokens": 1154456013.0, - "reward": 1.80126953125, - "reward_std": 0.5381090044975281, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.19397197663784027, + "grad_norm": 2.3155248165130615, + "kl": 1.8515625, + "learning_rate": 2.8994169798001334e-07, + "loss": 0.0863, + "num_tokens": 1196133470.0, + "reward": 1.0322265625, + "reward_std": 0.27031049132347107, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.9482421875, + "rewards/tag_count_reward/std": 0.16180500388145447, "step": 2129 }, { @@ -61756,27 +61756,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1924.0, - "completions/mean_length": 778.046875, - "completions/mean_terminated_length": 750.1636962890625, - "completions/min_length": 54.0, - "completions/min_terminated_length": 54.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 921.6015625, + "completions/mean_terminated_length": 903.7222900390625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, "epoch": 0.7271485875224033, - "grad_norm": 1.834752082824707, - "kl": 5.53125, - "learning_rate": 2.8937154429914233e-07, - "loss": 0.3485, - "num_tokens": 1154933397.0, - "reward": 1.85888671875, - "reward_std": 0.5585477352142334, - "rewards/accuracy_reward/mean": 0.07258064299821854, - "rewards/accuracy_reward/std": 0.25970885157585144, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.18635430932044983, + "grad_norm": 1.7204222679138184, + "kl": 2.94921875, + "learning_rate": 2.895041978100182e-07, + "loss": 0.161, + "num_tokens": 1196684354.0, + "reward": 1.0810546875, + "reward_std": 0.37718504667282104, + "rewards/accuracy_reward/mean": 0.1270161271095276, + "rewards/accuracy_reward/std": 0.3333272337913513, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.9169921875, + "rewards/tag_count_reward/std": 0.20076745748519897, "step": 2130 }, { @@ -61785,27 +61785,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 764.798828125, - "completions/mean_terminated_length": 741.8389282226562, - "completions/min_length": 179.0, - "completions/min_terminated_length": 179.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 942.431640625, + "completions/mean_terminated_length": 927.10693359375, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, "epoch": 0.7274899718357941, - "grad_norm": 1.4363107681274414, - "kl": 4.98046875, - "learning_rate": 2.889346921645737e-07, - "loss": 0.3136, - "num_tokens": 1155397854.0, - "reward": 1.90673828125, - "reward_std": 0.5185901522636414, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.1828918159008026, + "grad_norm": 2.0065958499908447, + "kl": 1.8681640625, + "learning_rate": 2.8906706764674294e-07, + "loss": 0.089, + "num_tokens": 1197239759.0, + "reward": 1.146484375, + "reward_std": 0.3562297821044922, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.947265625, + "rewards/tag_count_reward/std": 0.15531150996685028, "step": 2131 }, { @@ -61814,27 +61814,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 907.546875, - "completions/mean_terminated_length": 870.758056640625, - "completions/min_length": 211.0, - "completions/min_terminated_length": 211.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1077.025390625, + "completions/mean_terminated_length": 1061.6131591796875, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, "epoch": 0.7278313561491849, - "grad_norm": 1.5458734035491943, - "kl": 4.8984375, - "learning_rate": 2.8849821056445983e-07, - "loss": 0.2985, - "num_tokens": 1155941974.0, - "reward": 1.8466796875, - "reward_std": 0.4652012586593628, - "rewards/accuracy_reward/mean": 0.03427419438958168, - "rewards/accuracy_reward/std": 0.18211629986763, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.1911715567111969, + "grad_norm": 1.905436635017395, + "kl": 2.525390625, + "learning_rate": 2.886303081110842e-07, + "loss": 0.1406, + "num_tokens": 1197870652.0, + "reward": 1.02880859375, + "reward_std": 0.34721454977989197, + "rewards/accuracy_reward/mean": 0.06451612710952759, + "rewards/accuracy_reward/std": 0.2459181249141693, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.91748046875, + "rewards/tag_count_reward/std": 0.20005404949188232, "step": 2132 }, { @@ -61845,25 +61845,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1965.0, - "completions/mean_length": 848.89453125, - "completions/mean_terminated_length": 805.2024536132812, - "completions/min_length": 106.0, - "completions/min_terminated_length": 106.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1032.8828125, + "completions/mean_terminated_length": 995.894775390625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, "epoch": 0.7281727404625757, - "grad_norm": 0.8934493064880371, - "kl": 5.80859375, - "learning_rate": 2.880621001183069e-07, - "loss": 0.3515, - "num_tokens": 1156451184.0, - "reward": 1.81640625, - "reward_std": 0.48416393995285034, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.20392431318759918, + "grad_norm": 1.2293317317962646, + "kl": 1.908203125, + "learning_rate": 2.88193919823413e-07, + "loss": 0.0942, + "num_tokens": 1198474064.0, + "reward": 1.05419921875, + "reward_std": 0.3445899486541748, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.91748046875, + "rewards/tag_count_reward/std": 0.1913033276796341, "step": 2133 }, { @@ -61872,27 +61872,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 758.8828125, - "completions/mean_terminated_length": 730.578857421875, - "completions/min_length": 83.0, - "completions/min_terminated_length": 83.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 913.3203125, + "completions/mean_terminated_length": 895.3095703125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.7285141247759666, - "grad_norm": 2.6557486057281494, - "kl": 4.7890625, - "learning_rate": 2.8762636144509366e-07, - "loss": 0.2927, - "num_tokens": 1156919844.0, - "reward": 1.82080078125, - "reward_std": 0.5306501388549805, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.18663610517978668, + "grad_norm": 2.019636631011963, + "kl": 2.1123046875, + "learning_rate": 2.8775790340357265e-07, + "loss": 0.1533, + "num_tokens": 1199021796.0, + "reward": 1.00048828125, + "reward_std": 0.27511173486709595, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.017578125, + "rewards/format_reward/std": 0.13154059648513794, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.1966189742088318, "step": 2134 }, { @@ -61901,27 +61901,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1999.0, - "completions/mean_length": 812.04296875, - "completions/mean_terminated_length": 769.5960083007812, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1014.330078125, + "completions/mean_terminated_length": 980.98583984375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.7288555090893574, - "grad_norm": 1.5461405515670776, - "kl": 5.6796875, - "learning_rate": 2.871909951632716e-07, - "loss": 0.3617, - "num_tokens": 1157407946.0, - "reward": 1.8916015625, - "reward_std": 0.5402153730392456, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.9306640625, - "rewards/tag_count_reward/std": 0.18526214361190796, + "grad_norm": 3.724452495574951, + "kl": 2.501953125, + "learning_rate": 2.873222594708785e-07, + "loss": 0.14, + "num_tokens": 1199613469.0, + "reward": 1.0927734375, + "reward_std": 0.38656413555145264, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.9072265625, + "rewards/tag_count_reward/std": 0.21027082204818726, "step": 2135 }, { @@ -61930,27 +61930,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 834.62890625, - "completions/mean_terminated_length": 800.51806640625, - "completions/min_length": 191.0, - "completions/min_terminated_length": 191.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 991.6015625, + "completions/mean_terminated_length": 970.5578002929688, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, "epoch": 0.7291968934027482, - "grad_norm": 1.1518545150756836, - "kl": 5.6875, - "learning_rate": 2.867560018907634e-07, - "loss": 0.3536, - "num_tokens": 1157917660.0, - "reward": 1.8671875, - "reward_std": 0.4876805245876312, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.19139383733272552, + "grad_norm": 1.4995718002319336, + "kl": 1.9453125, + "learning_rate": 2.8688698864411633e-07, + "loss": 0.0956, + "num_tokens": 1200203553.0, + "reward": 1.05615234375, + "reward_std": 0.2746681571006775, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.1804301142692566, "step": 2136 }, { @@ -61959,27 +61959,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 908.12890625, - "completions/mean_terminated_length": 849.614013671875, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1041.615234375, + "completions/mean_terminated_length": 1013.3232421875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.729538277716139, - "grad_norm": 1.1337326765060425, - "kl": 7.015625, - "learning_rate": 2.863213822449629e-07, - "loss": 0.4198, - "num_tokens": 1158467358.0, - "reward": 1.80859375, - "reward_std": 0.5352450609207153, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.20436429977416992, + "grad_norm": 1.7754127979278564, + "kl": 1.8671875, + "learning_rate": 2.864520915415426e-07, + "loss": 0.0969, + "num_tokens": 1200821596.0, + "reward": 1.009765625, + "reward_std": 0.31180790066719055, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15143637359142303, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.19246920943260193, "step": 2137 }, { @@ -61988,27 +61988,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 842.193359375, - "completions/mean_terminated_length": 793.1768188476562, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 986.05859375, + "completions/mean_terminated_length": 960.572021484375, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, "epoch": 0.7298796620295297, - "grad_norm": 2.4628489017486572, - "kl": 7.234375, - "learning_rate": 2.8588713684273247e-07, - "loss": 0.4458, - "num_tokens": 1158976961.0, - "reward": 1.85595703125, - "reward_std": 0.5720502734184265, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.19861595332622528, + "grad_norm": 3.720889091491699, + "kl": 2.173828125, + "learning_rate": 2.8601756878088236e-07, + "loss": 0.0893, + "num_tokens": 1201404858.0, + "reward": 1.0966796875, + "reward_std": 0.37317997217178345, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.1910315454006195, "step": 2138 }, { @@ -62017,27 +62017,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1967.0, - "completions/mean_length": 819.2578125, - "completions/mean_terminated_length": 771.902587890625, - "completions/min_length": 167.0, - "completions/min_terminated_length": 167.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 950.767578125, + "completions/mean_terminated_length": 933.3512573242188, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, "epoch": 0.7302210463429205, - "grad_norm": 1.3563839197158813, - "kl": 5.94140625, - "learning_rate": 2.8545326630040436e-07, - "loss": 0.3266, - "num_tokens": 1159472389.0, - "reward": 1.83837890625, - "reward_std": 0.5455411076545715, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.19709952175617218, + "grad_norm": 2.001533031463623, + "kl": 2.091796875, + "learning_rate": 2.855834209793293e-07, + "loss": 0.1049, + "num_tokens": 1201967619.0, + "reward": 1.06396484375, + "reward_std": 0.27928805351257324, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.17077696323394775, "step": 2139 }, { @@ -62046,27 +62046,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1965.0, - "completions/mean_length": 793.595703125, - "completions/mean_terminated_length": 758.331298828125, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 949.84375, + "completions/mean_terminated_length": 930.1947631835938, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, "epoch": 0.7305624306563113, - "grad_norm": 1.2304555177688599, - "kl": 5.65625, - "learning_rate": 2.850197712337786e-07, - "loss": 0.3686, - "num_tokens": 1159954070.0, - "reward": 1.9013671875, - "reward_std": 0.5251258611679077, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.9306640625, - "rewards/tag_count_reward/std": 0.18526214361190796, + "grad_norm": 2.0633723735809326, + "kl": 1.595703125, + "learning_rate": 2.851496487535445e-07, + "loss": 0.0531, + "num_tokens": 1202529299.0, + "reward": 1.14990234375, + "reward_std": 0.3061620593070984, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.94482421875, + "rewards/tag_count_reward/std": 0.16180720925331116, "step": 2140 }, { @@ -62075,27 +62075,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 873.259765625, - "completions/mean_terminated_length": 830.4555053710938, - "completions/min_length": 192.0, - "completions/min_terminated_length": 192.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1083.74609375, + "completions/mean_terminated_length": 1044.5487060546875, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, "epoch": 0.7309038149697021, - "grad_norm": 1.5168801546096802, - "kl": 6.171875, - "learning_rate": 2.84586652258122e-07, - "loss": 0.3865, - "num_tokens": 1160479995.0, - "reward": 1.81396484375, - "reward_std": 0.47107118368148804, - "rewards/accuracy_reward/mean": 0.02016128972172737, - "rewards/accuracy_reward/std": 0.14069372415542603, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.93701171875, - "rewards/tag_count_reward/std": 0.1791812628507614, + "grad_norm": 4.550657749176025, + "kl": 2.1162109375, + "learning_rate": 2.8471625271965537e-07, + "loss": 0.1142, + "num_tokens": 1203162993.0, + "reward": 0.9892578125, + "reward_std": 0.314760684967041, + "rewards/accuracy_reward/mean": 0.05040322616696358, + "rewards/accuracy_reward/std": 0.21899643540382385, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.9033203125, + "rewards/tag_count_reward/std": 0.21485605835914612, "step": 2141 }, { @@ -62104,27 +62104,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 787.8828125, - "completions/mean_terminated_length": 739.3184204101562, - "completions/min_length": 69.0, - "completions/min_terminated_length": 69.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 941.640625, + "completions/mean_terminated_length": 928.5217895507812, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.731245199283093, - "grad_norm": 1.4604676961898804, - "kl": 7.3671875, - "learning_rate": 2.841539099881678e-07, - "loss": 0.4537, - "num_tokens": 1160954559.0, - "reward": 1.83935546875, - "reward_std": 0.590590238571167, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.21210047602653503, + "grad_norm": 4.507602214813232, + "kl": 1.615234375, + "learning_rate": 2.842832334932554e-07, + "loss": 0.0229, + "num_tokens": 1203716281.0, + "reward": 1.16015625, + "reward_std": 0.37245625257492065, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.1628674566745758, "step": 2142 }, { @@ -62133,27 +62133,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 819.16015625, - "completions/mean_terminated_length": 756.0780639648438, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 995.8125, + "completions/mean_terminated_length": 948.5714111328125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.7315865835964838, - "grad_norm": 1.804317831993103, - "kl": 8.890625, - "learning_rate": 2.837215450381144e-07, - "loss": 0.5505, - "num_tokens": 1161449489.0, - "reward": 1.80712890625, - "reward_std": 0.583137571811676, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.90478515625, - "rewards/tag_count_reward/std": 0.2224915623664856, + "grad_norm": 4.16478967666626, + "kl": 2.28515625, + "learning_rate": 2.838505916894023e-07, + "loss": 0.1507, + "num_tokens": 1204301657.0, + "reward": 1.07666015625, + "reward_std": 0.3451623320579529, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.19170249998569489, "step": 2143 }, { @@ -62162,27 +62162,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 824.85546875, - "completions/mean_terminated_length": 772.5418090820312, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 970.0703125, + "completions/mean_terminated_length": 937.5371704101562, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, "epoch": 0.7319279679098746, - "grad_norm": 1.139277458190918, - "kl": 6.7734375, - "learning_rate": 2.832895580216249e-07, - "loss": 0.4459, - "num_tokens": 1161948711.0, - "reward": 1.833984375, - "reward_std": 0.4912651777267456, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.19994954764842987, + "grad_norm": 4.07260799407959, + "kl": 2.25, + "learning_rate": 2.834183279226181e-07, + "loss": 0.1491, + "num_tokens": 1204875229.0, + "reward": 1.01025390625, + "reward_std": 0.3133026361465454, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.19093835353851318, "step": 2144 }, { @@ -62191,27 +62191,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 856.619140625, - "completions/mean_terminated_length": 823.12646484375, - "completions/min_length": 52.0, - "completions/min_terminated_length": 52.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1000.810546875, + "completions/mean_terminated_length": 960.4523315429688, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, "epoch": 0.7322693522232654, - "grad_norm": 1.1441174745559692, - "kl": 6.625, - "learning_rate": 2.828579495518256e-07, - "loss": 0.3795, - "num_tokens": 1162463700.0, - "reward": 1.83984375, - "reward_std": 0.49727314710617065, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.19345958530902863, + "grad_norm": 2.2163825035095215, + "kl": 2.39453125, + "learning_rate": 2.829864428068875e-07, + "loss": 0.1145, + "num_tokens": 1205464044.0, + "reward": 1.04931640625, + "reward_std": 0.32089635729789734, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.92041015625, + "rewards/tag_count_reward/std": 0.1950674206018448, "step": 2145 }, { @@ -62220,27 +62220,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 820.228515625, - "completions/mean_terminated_length": 772.9107055664062, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 954.162109375, + "completions/mean_terminated_length": 930.1456909179688, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.7326107365366561, - "grad_norm": 2.1744227409362793, - "kl": 4.58984375, - "learning_rate": 2.824267202413061e-07, - "loss": 0.2925, - "num_tokens": 1162964681.0, - "reward": 1.95166015625, - "reward_std": 0.5318840742111206, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.184224471449852, + "grad_norm": 2.299699544906616, + "kl": 2.341796875, + "learning_rate": 2.825549369556578e-07, + "loss": 0.091, + "num_tokens": 1206033599.0, + "reward": 1.18505859375, + "reward_std": 0.3540371060371399, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.4027182459831238, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.94091796875, + "rewards/tag_count_reward/std": 0.1649305671453476, "step": 2146 }, { @@ -62249,27 +62249,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 852.888671875, - "completions/mean_terminated_length": 829.0817260742188, - "completions/min_length": 37.0, - "completions/min_terminated_length": 37.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1038.044921875, + "completions/mean_terminated_length": 1015.8702392578125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, "epoch": 0.7329521208500469, - "grad_norm": 1.288240909576416, - "kl": 5.5625, - "learning_rate": 2.8199587070211737e-07, - "loss": 0.2962, - "num_tokens": 1163483440.0, - "reward": 1.78369140625, - "reward_std": 0.5132990479469299, - "rewards/accuracy_reward/mean": 0.04233871027827263, - "rewards/accuracy_reward/std": 0.2015640139579773, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.20086915791034698, + "grad_norm": 3.308039665222168, + "kl": 1.958984375, + "learning_rate": 2.821238109818374e-07, + "loss": 0.0757, + "num_tokens": 1206647158.0, + "reward": 1.060546875, + "reward_std": 0.33447128534317017, + "rewards/accuracy_reward/mean": 0.06653226166963577, + "rewards/accuracy_reward/std": 0.2494617998600006, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.935546875, + "rewards/tag_count_reward/std": 0.17968250811100006, "step": 2147 }, { @@ -62278,27 +62278,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 865.533203125, - "completions/mean_terminated_length": 799.7052001953125, - "completions/min_length": 87.0, - "completions/min_terminated_length": 87.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1010.712890625, + "completions/mean_terminated_length": 981.5521850585938, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, "epoch": 0.7332935051634377, - "grad_norm": 0.9720637202262878, - "kl": 6.9296875, - "learning_rate": 2.815654015457715e-07, - "loss": 0.4364, - "num_tokens": 1164014721.0, - "reward": 1.81396484375, - "reward_std": 0.5638447999954224, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.20743593573570251, + "grad_norm": 3.0816457271575928, + "kl": 2.708984375, + "learning_rate": 2.8169306549779526e-07, + "loss": 0.1623, + "num_tokens": 1207252771.0, + "reward": 1.06103515625, + "reward_std": 0.3866375684738159, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.91455078125, + "rewards/tag_count_reward/std": 0.20666059851646423, "step": 2148 }, { @@ -62307,27 +62307,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 849.125, - "completions/mean_terminated_length": 807.9515380859375, - "completions/min_length": 193.0, - "completions/min_terminated_length": 193.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 988.701171875, + "completions/mean_terminated_length": 969.7474975585938, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, "epoch": 0.7336348894768285, - "grad_norm": 1.410672664642334, - "kl": 7.0078125, - "learning_rate": 2.8113531338324104e-07, - "loss": 0.4254, - "num_tokens": 1164529473.0, - "reward": 1.8828125, - "reward_std": 0.5692057609558105, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.19234009087085724, + "grad_norm": 5.88734769821167, + "kl": 2.41796875, + "learning_rate": 2.8126270111535945e-07, + "loss": 0.1117, + "num_tokens": 1207838986.0, + "reward": 1.125, + "reward_std": 0.3515871465206146, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.18423029780387878, "step": 2149 }, { @@ -62336,27 +62336,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1940.0, - "completions/mean_length": 747.064453125, - "completions/mean_terminated_length": 713.17236328125, - "completions/min_length": 100.0, - "completions/min_terminated_length": 100.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 968.232421875, + "completions/mean_terminated_length": 944.5249633789062, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.7339762737902193, - "grad_norm": 1.658525824546814, - "kl": 5.96875, - "learning_rate": 2.807056068249569e-07, - "loss": 0.3843, - "num_tokens": 1164984818.0, - "reward": 1.8369140625, - "reward_std": 0.578458309173584, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.18692579865455627, + "grad_norm": 2.8372082710266113, + "kl": 2.30859375, + "learning_rate": 2.8083271844581723e-07, + "loss": 0.1119, + "num_tokens": 1208407569.0, + "reward": 1.12548828125, + "reward_std": 0.3805568814277649, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.17115144431591034, "step": 2150 }, { @@ -62365,27 +62365,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1956.0, - "completions/mean_length": 812.7265625, - "completions/mean_terminated_length": 759.8941040039062, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 955.806640625, + "completions/mean_terminated_length": 925.1023559570312, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, "epoch": 0.7343176581036102, - "grad_norm": 2.316544771194458, - "kl": 7.13671875, - "learning_rate": 2.8027628248080944e-07, - "loss": 0.459, - "num_tokens": 1165473270.0, - "reward": 1.83935546875, - "reward_std": 0.5644776821136475, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.2068454772233963, + "grad_norm": 1.9893885850906372, + "kl": 2.20703125, + "learning_rate": 2.8040311809991334e-07, + "loss": 0.1182, + "num_tokens": 1208969278.0, + "reward": 1.10107421875, + "reward_std": 0.32023048400878906, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.17573511600494385, "step": 2151 }, { @@ -62394,27 +62394,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1841.0, - "completions/mean_length": 772.55859375, - "completions/mean_terminated_length": 741.9480590820312, - "completions/min_length": 78.0, - "completions/min_terminated_length": 78.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 940.81640625, + "completions/mean_terminated_length": 918.760986328125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, "epoch": 0.734659042417001, - "grad_norm": 1.7509030103683472, - "kl": 5.00390625, - "learning_rate": 2.7984734096014567e-07, - "loss": 0.3087, - "num_tokens": 1165943492.0, - "reward": 1.966796875, - "reward_std": 0.5468528270721436, - "rewards/accuracy_reward/mean": 0.162109375, - "rewards/accuracy_reward/std": 0.3689115643501282, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.16988569498062134, + "grad_norm": 4.575304985046387, + "kl": 1.951171875, + "learning_rate": 2.7997390068784967e-07, + "loss": 0.1412, + "num_tokens": 1209525648.0, + "reward": 1.14990234375, + "reward_std": 0.3050912320613861, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38430243730545044, + "rewards/format_reward/mean": 0.025390625, + "rewards/format_reward/std": 0.15746226906776428, + "rewards/tag_count_reward/mean": 0.94482421875, + "rewards/tag_count_reward/std": 0.16405922174453735, "step": 2152 }, { @@ -62423,27 +62423,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 791.978515625, - "completions/mean_terminated_length": 754.0703735351562, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 956.15625, + "completions/mean_terminated_length": 938.825439453125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, "epoch": 0.7350004267303918, - "grad_norm": 1.330442190170288, - "kl": 6.3203125, - "learning_rate": 2.794187828717698e-07, - "loss": 0.3979, - "num_tokens": 1166420761.0, - "reward": 1.84375, - "reward_std": 0.5745775699615479, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.20278719067573547, + "grad_norm": 2.601768970489502, + "kl": 3.09375, + "learning_rate": 2.795450668192842e-07, + "loss": 0.1907, + "num_tokens": 1210086976.0, + "reward": 1.07373046875, + "reward_std": 0.3354206383228302, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.92724609375, + "rewards/tag_count_reward/std": 0.19207598268985748, "step": 2153 }, { @@ -62452,27 +62452,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 856.05859375, - "completions/mean_terminated_length": 797.4384765625, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 1022.796875, + "completions/mean_terminated_length": 996.0881958007812, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, "epoch": 0.7353418110437825, - "grad_norm": 1.8320715427398682, - "kl": 6.828125, - "learning_rate": 2.789906088239419e-07, - "loss": 0.4784, - "num_tokens": 1166932583.0, - "reward": 1.82421875, - "reward_std": 0.5676283836364746, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.19872237741947174, + "grad_norm": 5.045741081237793, + "kl": 2.6796875, + "learning_rate": 2.791166171033301e-07, + "loss": 0.1201, + "num_tokens": 1210684168.0, + "reward": 1.08203125, + "reward_std": 0.34311410784721375, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.1834714710712433, "step": 2154 }, { @@ -62481,27 +62481,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1973.0, - "completions/mean_length": 809.400390625, - "completions/mean_terminated_length": 761.665283203125, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 1945.0, + "completions/mean_length": 952.97265625, + "completions/mean_terminated_length": 926.6920166015625, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, "epoch": 0.7356831953571733, - "grad_norm": 2.485974073410034, - "kl": 5.578125, - "learning_rate": 2.7856281942437635e-07, - "loss": 0.3566, - "num_tokens": 1167431924.0, - "reward": 1.87841796875, - "reward_std": 0.524060070514679, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.1981005072593689, + "grad_norm": 2.6554343700408936, + "kl": 2.30078125, + "learning_rate": 2.7868855214855465e-07, + "loss": 0.124, + "num_tokens": 1211257018.0, + "reward": 1.1103515625, + "reward_std": 0.3540228307247162, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.18193122744560242, "step": 2155 }, { @@ -62510,27 +62510,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 837.966796875, - "completions/mean_terminated_length": 801.4466552734375, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1030.09765625, + "completions/mean_terminated_length": 997.2620849609375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, "epoch": 0.7360245796705641, - "grad_norm": 4.13815975189209, - "kl": 4.4921875, - "learning_rate": 2.781354152802422e-07, - "loss": 0.327, - "num_tokens": 1167937155.0, - "reward": 1.86767578125, - "reward_std": 0.539473295211792, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.18408437073230743, + "grad_norm": 1.4620965719223022, + "kl": 1.677734375, + "learning_rate": 2.78260872562979e-07, + "loss": 0.0632, + "num_tokens": 1211860620.0, + "reward": 1.134765625, + "reward_std": 0.338411808013916, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.17476527392864227, "step": 2156 }, { @@ -62539,27 +62539,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 863.119140625, - "completions/mean_terminated_length": 819.9453735351562, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 980.134765625, + "completions/mean_terminated_length": 952.3146362304688, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, "epoch": 0.7363659639839549, - "grad_norm": 1.896857738494873, - "kl": 5.109375, - "learning_rate": 2.777083969981611e-07, - "loss": 0.3534, - "num_tokens": 1168448800.0, - "reward": 1.861328125, - "reward_std": 0.5006389617919922, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.931640625, - "rewards/tag_count_reward/std": 0.1869385838508606, + "grad_norm": 1.5125021934509277, + "kl": 1.5859375, + "learning_rate": 2.778335789540767e-07, + "loss": 0.0747, + "num_tokens": 1212432177.0, + "reward": 1.046875, + "reward_std": 0.28343820571899414, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.16324250400066376, "step": 2157 }, { @@ -62568,27 +62568,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 872.470703125, - "completions/mean_terminated_length": 801.8902587890625, - "completions/min_length": 82.0, - "completions/min_terminated_length": 82.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1017.806640625, + "completions/mean_terminated_length": 990.9679565429688, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, "epoch": 0.7367073482973457, - "grad_norm": 2.221587896347046, - "kl": 7.5390625, - "learning_rate": 2.7728176518420786e-07, - "loss": 0.4785, - "num_tokens": 1168982625.0, - "reward": 1.77734375, - "reward_std": 0.5431791543960571, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.20497123897075653, + "grad_norm": 5.71192741394043, + "kl": 2.8359375, + "learning_rate": 2.774066719287729e-07, + "loss": 0.1898, + "num_tokens": 1213040414.0, + "reward": 1.03076171875, + "reward_std": 0.3434918224811554, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.19498902559280396, "step": 2158 }, { @@ -62597,27 +62597,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1928.0, - "completions/mean_length": 839.021484375, - "completions/mean_terminated_length": 800.0221557617188, - "completions/min_length": 90.0, - "completions/min_terminated_length": 90.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1013.958984375, + "completions/mean_terminated_length": 991.2554931640625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.7370487326107366, - "grad_norm": 1.8725794553756714, - "kl": 5.33203125, - "learning_rate": 2.768555204439079e-07, - "loss": 0.3407, - "num_tokens": 1169487356.0, - "reward": 1.8466796875, - "reward_std": 0.47082775831222534, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.17521031200885773, + "grad_norm": 2.491302013397217, + "kl": 2.712890625, + "learning_rate": 2.7698015209344404e-07, + "loss": 0.1539, + "num_tokens": 1213634713.0, + "reward": 1.03662109375, + "reward_std": 0.30933213233947754, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.92529296875, + "rewards/tag_count_reward/std": 0.18874886631965637, "step": 2159 }, { @@ -62626,27 +62626,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 801.978515625, - "completions/mean_terminated_length": 777.1574096679688, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 949.546875, + "completions/mean_terminated_length": 920.9298706054688, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, "epoch": 0.7373901169241274, - "grad_norm": 1.1445845365524292, - "kl": 4.90234375, - "learning_rate": 2.764296633822379e-07, - "loss": 0.3161, - "num_tokens": 1169976049.0, - "reward": 1.87548828125, - "reward_std": 0.46409809589385986, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.94384765625, - "rewards/tag_count_reward/std": 0.1659528613090515, + "grad_norm": 4.889411926269531, + "kl": 2.072265625, + "learning_rate": 2.765540200539166e-07, + "loss": 0.1224, + "num_tokens": 1214198961.0, + "reward": 1.0322265625, + "reward_std": 0.3285452723503113, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.1812576949596405, "step": 2160 }, { @@ -62655,27 +62655,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 794.51171875, - "completions/mean_terminated_length": 772.0834350585938, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 961.775390625, + "completions/mean_terminated_length": 935.7060546875, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, "epoch": 0.7377315012375182, - "grad_norm": 1.0854650735855103, - "kl": 4.8046875, - "learning_rate": 2.7600419460362416e-07, - "loss": 0.3206, - "num_tokens": 1170463351.0, - "reward": 1.92724609375, - "reward_std": 0.4725736081600189, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.896484375, - "rewards/format_reward/std": 0.30492907762527466, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.16028828918933868, + "grad_norm": 1.7071599960327148, + "kl": 2.98828125, + "learning_rate": 2.7612827641546566e-07, + "loss": 0.1902, + "num_tokens": 1214771902.0, + "reward": 1.12548828125, + "reward_std": 0.36986303329467773, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.18521249294281006, "step": 2161 }, { @@ -62684,27 +62684,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1735.0, - "completions/mean_length": 761.931640625, - "completions/mean_terminated_length": 741.5178833007812, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 915.26953125, + "completions/mean_terminated_length": 881.0824584960938, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.7380728855509089, - "grad_norm": 1.5088930130004883, - "kl": 4.546875, - "learning_rate": 2.7557911471194167e-07, - "loss": 0.2663, - "num_tokens": 1170925444.0, - "reward": 1.90625, - "reward_std": 0.4686610996723175, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.947265625, - "rewards/tag_count_reward/std": 0.1659708470106125, + "grad_norm": 1.9646536111831665, + "kl": 2.873046875, + "learning_rate": 2.757029217828153e-07, + "loss": 0.1752, + "num_tokens": 1215312504.0, + "reward": 1.150390625, + "reward_std": 0.33948659896850586, + "rewards/accuracy_reward/mean": 0.173828125, + "rewards/accuracy_reward/std": 0.3793322443962097, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.16547498106956482, "step": 2162 }, { @@ -62713,27 +62713,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1983.0, - "completions/mean_length": 815.39453125, - "completions/mean_terminated_length": 770.4818115234375, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 972.220703125, + "completions/mean_terminated_length": 926.2098388671875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, "epoch": 0.7384142698642997, - "grad_norm": 2.3725340366363525, - "kl": 8.6875, - "learning_rate": 2.7515442431051363e-07, - "loss": 0.5359, - "num_tokens": 1171425982.0, - "reward": 1.82861328125, - "reward_std": 0.5628782510757446, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.20722854137420654, + "grad_norm": 2.8977572917938232, + "kl": 3.1875, + "learning_rate": 2.7527795676013654e-07, + "loss": 0.1712, + "num_tokens": 1215893337.0, + "reward": 1.05810546875, + "reward_std": 0.32645851373672485, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.20343582332134247, "step": 2163 }, { @@ -62742,27 +62742,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1950.0, - "completions/mean_length": 874.740234375, - "completions/mean_terminated_length": 831.9899291992188, - "completions/min_length": 168.0, - "completions/min_terminated_length": 168.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1021.349609375, + "completions/mean_terminated_length": 1002.9801025390625, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, "epoch": 0.7387556541776905, - "grad_norm": 2.1587789058685303, - "kl": 7.765625, - "learning_rate": 2.747301240021101e-07, - "loss": 0.4554, - "num_tokens": 1171950025.0, - "reward": 1.8076171875, - "reward_std": 0.5351447463035583, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.1968260258436203, + "grad_norm": 4.004371643066406, + "kl": 2.318359375, + "learning_rate": 2.7485338195104736e-07, + "loss": 0.1268, + "num_tokens": 1216492444.0, + "reward": 1.0498046875, + "reward_std": 0.3321082890033722, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.18657785654067993, "step": 2164 }, { @@ -62771,27 +62771,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1902.0, - "completions/mean_length": 862.8671875, - "completions/mean_terminated_length": 822.1657104492188, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1013.984375, + "completions/mean_terminated_length": 982.776611328125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, "epoch": 0.7390970384910813, - "grad_norm": 2.766791582107544, - "kl": 6.2890625, - "learning_rate": 2.7430621438894816e-07, - "loss": 0.3801, - "num_tokens": 1172459973.0, - "reward": 1.8203125, - "reward_std": 0.45686471462249756, - "rewards/accuracy_reward/mean": 0.017578125, - "rewards/accuracy_reward/std": 0.13154059648513794, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.1816815733909607, + "grad_norm": 2.154346227645874, + "kl": 2.015625, + "learning_rate": 2.744291979586112e-07, + "loss": 0.1035, + "num_tokens": 1217079764.0, + "reward": 1.0634765625, + "reward_std": 0.31781646609306335, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.16808471083641052, "step": 2165 }, { @@ -62800,27 +62800,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1959.0, - "completions/mean_length": 846.95703125, - "completions/mean_terminated_length": 810.7081909179688, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1020.060546875, + "completions/mean_terminated_length": 1003.7440795898438, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, "epoch": 0.7394384228044721, - "grad_norm": 2.988097667694092, - "kl": 6.1171875, - "learning_rate": 2.7388269607268967e-07, - "loss": 0.3347, - "num_tokens": 1172974895.0, - "reward": 1.833984375, - "reward_std": 0.4791252613067627, - "rewards/accuracy_reward/mean": 0.02734375, - "rewards/accuracy_reward/std": 0.16324250400066376, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.931640625, - "rewards/tag_count_reward/std": 0.19018182158470154, + "grad_norm": 1.7318470478057861, + "kl": 2.091796875, + "learning_rate": 2.740054053853369e-07, + "loss": 0.1006, + "num_tokens": 1217683315.0, + "reward": 1.0322265625, + "reward_std": 0.32956641912460327, + "rewards/accuracy_reward/mean": 0.044921875, + "rewards/accuracy_reward/std": 0.20733514428138733, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.9326171875, + "rewards/tag_count_reward/std": 0.17860238254070282, "step": 2166 }, { @@ -62829,27 +62829,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 817.06640625, - "completions/mean_terminated_length": 761.7999877929688, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 996.560546875, + "completions/mean_terminated_length": 956.0385131835938, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, "epoch": 0.739779807117863, - "grad_norm": 1.7172340154647827, - "kl": 7.24609375, - "learning_rate": 2.734595696544416e-07, - "loss": 0.4631, - "num_tokens": 1173469329.0, - "reward": 1.81787109375, - "reward_std": 0.5089281797409058, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.20189879834651947, + "grad_norm": 1.7835890054702759, + "kl": 1.919921875, + "learning_rate": 2.735820048331765e-07, + "loss": 0.1062, + "num_tokens": 1218269650.0, + "reward": 1.02734375, + "reward_std": 0.278891384601593, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.939453125, + "rewards/tag_count_reward/std": 0.15948821604251862, "step": 2167 }, { @@ -62858,27 +62858,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 804.94140625, - "completions/mean_terminated_length": 772.55712890625, - "completions/min_length": 186.0, - "completions/min_terminated_length": 186.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1009.095703125, + "completions/mean_terminated_length": 973.4161987304688, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, "epoch": 0.7401211914312538, - "grad_norm": 0.8305309414863586, - "kl": 5.4453125, - "learning_rate": 2.730368357347548e-07, - "loss": 0.3683, - "num_tokens": 1173968115.0, - "reward": 1.89111328125, - "reward_std": 0.5046444535255432, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.93798828125, - "rewards/tag_count_reward/std": 0.17257477343082428, + "grad_norm": 3.8984973430633545, + "kl": 2.306640625, + "learning_rate": 2.731589969035261e-07, + "loss": 0.1386, + "num_tokens": 1218872963.0, + "reward": 1.09716796875, + "reward_std": 0.3077167272567749, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.18708612024784088, "step": 2168 }, { @@ -62887,27 +62887,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1933.0, - "completions/mean_length": 857.11328125, - "completions/mean_terminated_length": 798.5450439453125, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1044.5, + "completions/mean_terminated_length": 1003.707275390625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, "epoch": 0.7404625757446446, - "grad_norm": 1.792198896408081, - "kl": 6.65625, - "learning_rate": 2.7261449491362197e-07, - "loss": 0.4595, - "num_tokens": 1174481533.0, - "reward": 1.849609375, - "reward_std": 0.49879589676856995, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.19342006742954254, + "grad_norm": 4.998632431030273, + "kl": 2.759765625, + "learning_rate": 2.7273638219722315e-07, + "loss": 0.1671, + "num_tokens": 1219482323.0, + "reward": 1.0419921875, + "reward_std": 0.36220264434814453, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.9033203125, + "rewards/tag_count_reward/std": 0.21428602933883667, "step": 2169 }, { @@ -62916,27 +62916,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1974.0, - "completions/mean_length": 832.763671875, - "completions/mean_terminated_length": 783.36376953125, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1022.419921875, + "completions/mean_terminated_length": 1001.9900512695312, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, "epoch": 0.7408039600580353, - "grad_norm": 1.9688204526901245, - "kl": 4.9375, - "learning_rate": 2.721925477904794e-07, - "loss": 0.3188, - "num_tokens": 1174988260.0, - "reward": 1.88671875, - "reward_std": 0.5032538175582886, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.18636520206928253, + "grad_norm": 1.772864580154419, + "kl": 3.1015625, + "learning_rate": 2.723141613145476e-07, + "loss": 0.1553, + "num_tokens": 1220086154.0, + "reward": 1.09619140625, + "reward_std": 0.37537693977355957, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, + "rewards/tag_count_reward/mean": 0.90869140625, + "rewards/tag_count_reward/std": 0.2106221467256546, "step": 2170 }, { @@ -62945,27 +62945,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 881.09765625, - "completions/mean_terminated_length": 818.6707763671875, - "completions/min_length": 190.0, - "completions/min_terminated_length": 190.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1065.98828125, + "completions/mean_terminated_length": 1011.3196411132812, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, "epoch": 0.7411453443714261, - "grad_norm": 2.3532350063323975, - "kl": 6.265625, - "learning_rate": 2.717709949642034e-07, - "loss": 0.4217, - "num_tokens": 1175514406.0, - "reward": 1.8876953125, - "reward_std": 0.5550177097320557, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.20141369104385376, + "grad_norm": 1.623914361000061, + "kl": 2.4765625, + "learning_rate": 2.7189233485521934e-07, + "loss": 0.1174, + "num_tokens": 1220706964.0, + "reward": 1.10546875, + "reward_std": 0.3655689060688019, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.912109375, + "rewards/tag_count_reward/std": 0.19305415451526642, "step": 2171 }, { @@ -62974,27 +62974,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 792.400390625, - "completions/mean_terminated_length": 754.5050048828125, - "completions/min_length": 153.0, - "completions/min_terminated_length": 153.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 947.958984375, + "completions/mean_terminated_length": 928.2763061523438, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, "epoch": 0.7414867286848169, - "grad_norm": 2.590756893157959, - "kl": 5.26171875, - "learning_rate": 2.7134983703311136e-07, - "loss": 0.3631, - "num_tokens": 1175990595.0, - "reward": 1.91357421875, - "reward_std": 0.4403735101222992, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.91015625, - "rewards/format_reward/std": 0.2862374484539032, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.17971375584602356, + "grad_norm": 3.8861334323883057, + "kl": 2.546875, + "learning_rate": 2.714709034183984e-07, + "loss": 0.1417, + "num_tokens": 1221262799.0, + "reward": 1.037109375, + "reward_std": 0.3145906627178192, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.18861782550811768, "step": 2172 }, { @@ -63003,27 +63003,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 853.4296875, - "completions/mean_terminated_length": 809.90283203125, - "completions/min_length": 183.0, - "completions/min_terminated_length": 183.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 967.7421875, + "completions/mean_terminated_length": 950.5952758789062, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, "epoch": 0.7418281129982077, - "grad_norm": 1.6033027172088623, - "kl": 6.078125, - "learning_rate": 2.7092907459495973e-07, - "loss": 0.4233, - "num_tokens": 1176505327.0, - "reward": 1.8564453125, - "reward_std": 0.5025766491889954, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.18859504163265228, + "grad_norm": 1.6836764812469482, + "kl": 2.0234375, + "learning_rate": 2.7104986760268324e-07, + "loss": 0.1047, + "num_tokens": 1221836059.0, + "reward": 1.08349609375, + "reward_std": 0.3300013244152069, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.18665657937526703, "step": 2173 }, { @@ -63032,27 +63032,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 832.955078125, - "completions/mean_terminated_length": 791.226318359375, - "completions/min_length": 216.0, - "completions/min_terminated_length": 216.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1038.064453125, + "completions/mean_terminated_length": 988.3954467773438, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.7421694973115985, - "grad_norm": 0.7020198106765747, - "kl": 7.4375, - "learning_rate": 2.7050870824694407e-07, - "loss": 0.4757, - "num_tokens": 1177006648.0, - "reward": 1.8623046875, - "reward_std": 0.5373298525810242, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.9287109375, - "rewards/tag_count_reward/std": 0.19546197354793549, + "grad_norm": 2.9049177169799805, + "kl": 3.1953125, + "learning_rate": 2.70629228006111e-07, + "loss": 0.1706, + "num_tokens": 1222442396.0, + "reward": 1.05029296875, + "reward_std": 0.35290706157684326, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.91552734375, + "rewards/tag_count_reward/std": 0.20348748564720154, "step": 2174 }, { @@ -63061,27 +63061,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1935.0, - "completions/mean_length": 869.595703125, - "completions/mean_terminated_length": 829.1253051757812, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1060.767578125, + "completions/mean_terminated_length": 1043.1033935546875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, "epoch": 0.7425108816249893, - "grad_norm": 1.6543843746185303, - "kl": 6.453125, - "learning_rate": 2.700887385856974e-07, - "loss": 0.395, - "num_tokens": 1177535497.0, - "reward": 1.8525390625, - "reward_std": 0.5069622993469238, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.18729348480701447, + "grad_norm": 3.161365032196045, + "kl": 1.9296875, + "learning_rate": 2.702089852261553e-07, + "loss": 0.0691, + "num_tokens": 1223069125.0, + "reward": 1.0400390625, + "reward_std": 0.3222488760948181, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.1872730702161789, "step": 2175 }, { @@ -63090,27 +63090,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 857.248046875, - "completions/mean_terminated_length": 826.2264404296875, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1046.0546875, + "completions/mean_terminated_length": 1013.7338256835938, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, "epoch": 0.7428522659383802, - "grad_norm": 0.9050835967063904, - "kl": 5.4453125, - "learning_rate": 2.6966916620728966e-07, - "loss": 0.3058, - "num_tokens": 1178053832.0, - "reward": 1.85205078125, - "reward_std": 0.500805139541626, - "rewards/accuracy_reward/mean": 0.04032257944345474, - "rewards/accuracy_reward/std": 0.19691328704357147, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.18154938519001007, + "grad_norm": 2.33664608001709, + "kl": 2.275390625, + "learning_rate": 2.6978913985972683e-07, + "loss": 0.1185, + "num_tokens": 1223684129.0, + "reward": 1.06103515625, + "reward_std": 0.3221091628074646, + "rewards/accuracy_reward/mean": 0.07056451588869095, + "rewards/accuracy_reward/std": 0.25635457038879395, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.92822265625, + "rewards/tag_count_reward/std": 0.1779128909111023, "step": 2176 }, { @@ -63119,27 +63119,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 887.3125, - "completions/mean_terminated_length": 840.1300659179688, - "completions/min_length": 193.0, - "completions/min_terminated_length": 193.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1063.375, + "completions/mean_terminated_length": 1027.498046875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, "epoch": 0.743193650251771, - "grad_norm": 1.8727673292160034, - "kl": 7.3828125, - "learning_rate": 2.6924999170722743e-07, - "loss": 0.4327, - "num_tokens": 1178597032.0, - "reward": 1.80029296875, - "reward_std": 0.4992356300354004, - "rewards/accuracy_reward/mean": 0.021484375, - "rewards/accuracy_reward/std": 0.14513419568538666, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.20024023950099945, + "grad_norm": 7.468886375427246, + "kl": 2.8984375, + "learning_rate": 2.6936969250317154e-07, + "loss": 0.1142, + "num_tokens": 1224317473.0, + "reward": 1.02294921875, + "reward_std": 0.35908567905426025, + "rewards/accuracy_reward/mean": 0.029296875, + "rewards/accuracy_reward/std": 0.16880230605602264, + "rewards/format_reward/mean": 0.083984375, + "rewards/format_reward/std": 0.2776356339454651, + "rewards/tag_count_reward/mean": 0.90966796875, + "rewards/tag_count_reward/std": 0.20516635477542877, "step": 2177 }, { @@ -63148,27 +63148,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 865.021484375, - "completions/mean_terminated_length": 791.3921508789062, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1028.158203125, + "completions/mean_terminated_length": 997.3782348632812, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, "epoch": 0.7435350345651617, - "grad_norm": 2.3138346672058105, - "kl": 8.40625, - "learning_rate": 2.6883121568045197e-07, - "loss": 0.5271, - "num_tokens": 1179114051.0, - "reward": 1.81591796875, - "reward_std": 0.5568733811378479, - "rewards/accuracy_reward/mean": 0.04435483738780022, - "rewards/accuracy_reward/std": 0.2060900777578354, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.22002561390399933, + "grad_norm": 1.6769273281097412, + "kl": 2.931640625, + "learning_rate": 2.6895064375227e-07, + "loss": 0.1552, + "num_tokens": 1224918018.0, + "reward": 1.044921875, + "reward_std": 0.3472713232040405, + "rewards/accuracy_reward/mean": 0.08870967477560043, + "rewards/accuracy_reward/std": 0.2846112847328186, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.20437365770339966, "step": 2178 }, { @@ -63177,27 +63177,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1975.0, - "completions/mean_length": 855.201171875, - "completions/mean_terminated_length": 814.2363891601562, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1043.041015625, + "completions/mean_terminated_length": 1016.8597412109375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, "epoch": 0.7438764188785525, - "grad_norm": 3.3663816452026367, - "kl": 7.8125, - "learning_rate": 2.6841283872133954e-07, - "loss": 0.4395, - "num_tokens": 1179637770.0, - "reward": 1.80810546875, - "reward_std": 0.5312970280647278, - "rewards/accuracy_reward/mean": 0.058467742055654526, - "rewards/accuracy_reward/std": 0.23486268520355225, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.20005404949188232, + "grad_norm": 3.5246975421905518, + "kl": 2.48828125, + "learning_rate": 2.685319942022364e-07, + "loss": 0.1505, + "num_tokens": 1225537911.0, + "reward": 1.0390625, + "reward_std": 0.33686167001724243, + "rewards/accuracy_reward/mean": 0.060483869165182114, + "rewards/accuracy_reward/std": 0.2386218160390854, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.2005603015422821, "step": 2179 }, { @@ -63206,27 +63206,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 879.1328125, - "completions/mean_terminated_length": 829.1405639648438, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1033.140625, + "completions/mean_terminated_length": 1019.0733032226562, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.7442178031919433, - "grad_norm": 2.449056625366211, - "kl": 8.90625, - "learning_rate": 2.6799486142369955e-07, - "loss": 0.5346, - "num_tokens": 1180159054.0, - "reward": 1.79052734375, - "reward_std": 0.5429601073265076, - "rewards/accuracy_reward/mean": 0.0234375, - "rewards/accuracy_reward/std": 0.15143637359142303, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.2030879557132721, + "grad_norm": 1.865007996559143, + "kl": 2.138671875, + "learning_rate": 2.6811374444771833e-07, + "loss": 0.0769, + "num_tokens": 1226138047.0, + "reward": 1.04443359375, + "reward_std": 0.34877651929855347, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.19118840992450714, "step": 2180 }, { @@ -63235,27 +63235,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 843.326171875, - "completions/mean_terminated_length": 809.4598388671875, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 997.76953125, + "completions/mean_terminated_length": 972.5640258789062, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, "epoch": 0.7445591875053341, - "grad_norm": 1.237591028213501, - "kl": 6.640625, - "learning_rate": 2.6757728438077414e-07, - "loss": 0.3992, - "num_tokens": 1180681429.0, - "reward": 1.8359375, - "reward_std": 0.5391745567321777, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.2015109360218048, + "grad_norm": 1.785409688949585, + "kl": 2.73828125, + "learning_rate": 2.676958950827952e-07, + "loss": 0.1224, + "num_tokens": 1226739497.0, + "reward": 1.05908203125, + "reward_std": 0.369469553232193, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.1946849673986435, "step": 2181 }, { @@ -63264,27 +63264,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 863.49609375, - "completions/mean_terminated_length": 807.783203125, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1017.2578125, + "completions/mean_terminated_length": 979.700439453125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, "epoch": 0.7449005718187249, - "grad_norm": 1.4139468669891357, - "kl": 7.6953125, - "learning_rate": 2.6716010818523794e-07, - "loss": 0.4984, - "num_tokens": 1181201299.0, - "reward": 1.859375, - "reward_std": 0.5659919381141663, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.20674438774585724, + "grad_norm": 3.1059558391571045, + "kl": 2.62109375, + "learning_rate": 2.6727844670097776e-07, + "loss": 0.1522, + "num_tokens": 1227338093.0, + "reward": 1.09423828125, + "reward_std": 0.3844451308250427, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.92041015625, + "rewards/tag_count_reward/std": 0.18998514115810394, "step": 2182 }, { @@ -63293,27 +63293,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.06640625, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1970.0, - "completions/mean_length": 831.076171875, - "completions/mean_terminated_length": 744.5167236328125, - "completions/min_length": 193.0, - "completions/min_terminated_length": 193.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 965.603515625, + "completions/mean_terminated_length": 952.768798828125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, "epoch": 0.7452419561321157, - "grad_norm": 0.7848081588745117, - "kl": 9.234375, - "learning_rate": 2.667433334291958e-07, - "loss": 0.6095, - "num_tokens": 1181699674.0, - "reward": 1.82080078125, - "reward_std": 0.5908867120742798, - "rewards/accuracy_reward/mean": 0.08669354766607285, - "rewards/accuracy_reward/std": 0.281669557094574, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.22056347131729126, + "grad_norm": 1.2423095703125, + "kl": 2.2421875, + "learning_rate": 2.668613998952074e-07, + "loss": 0.1251, + "num_tokens": 1227905346.0, + "reward": 1.09423828125, + "reward_std": 0.3628828525543213, + "rewards/accuracy_reward/mean": 0.13709677755832672, + "rewards/accuracy_reward/std": 0.34429675340652466, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.91845703125, + "rewards/tag_count_reward/std": 0.2010640949010849, "step": 2183 }, { @@ -63322,27 +63322,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 842.107421875, - "completions/mean_terminated_length": 780.2033081054688, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 980.681640625, + "completions/mean_terminated_length": 946.2520141601562, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.7455833404455066, - "grad_norm": 1.3622740507125854, - "kl": 6.9140625, - "learning_rate": 2.663269607041837e-07, - "loss": 0.4162, - "num_tokens": 1182215665.0, - "reward": 1.84716796875, - "reward_std": 0.5520458817481995, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.2068500965833664, + "grad_norm": 1.7096635103225708, + "kl": 3.12890625, + "learning_rate": 2.6644475525785497e-07, + "loss": 0.1906, + "num_tokens": 1228492287.0, + "reward": 1.0927734375, + "reward_std": 0.36463260650634766, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.19994714856147766, "step": 2184 }, { @@ -63351,27 +63351,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.064453125, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 892.083984375, - "completions/mean_terminated_length": 812.4488525390625, - "completions/min_length": 185.0, - "completions/min_terminated_length": 185.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1024.8515625, + "completions/mean_terminated_length": 998.1964111328125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, "epoch": 0.7459247247588974, - "grad_norm": 1.429387092590332, - "kl": 8.0546875, - "learning_rate": 2.6591099060116625e-07, - "loss": 0.5478, - "num_tokens": 1182745356.0, - "reward": 1.8076171875, - "reward_std": 0.5870583653450012, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.9130859375, - "rewards/tag_count_reward/std": 0.21045252680778503, + "grad_norm": 2.4175004959106445, + "kl": 2.251953125, + "learning_rate": 2.6602851338072e-07, + "loss": 0.0955, + "num_tokens": 1229089955.0, + "reward": 1.0869140625, + "reward_std": 0.3920617997646332, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.9189453125, + "rewards/tag_count_reward/std": 0.19414739310741425, "step": 2185 }, { @@ -63380,27 +63380,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 832.154296875, - "completions/mean_terminated_length": 769.7392578125, - "completions/min_length": 93.0, - "completions/min_terminated_length": 93.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 984.775390625, + "completions/mean_terminated_length": 967.8988647460938, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.7462661090722881, - "grad_norm": 2.683655261993408, - "kl": 7.5390625, - "learning_rate": 2.6549542371053714e-07, - "loss": 0.5169, - "num_tokens": 1183252475.0, - "reward": 1.7998046875, - "reward_std": 0.5724963545799255, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.21008896827697754, + "grad_norm": 2.8256916999816895, + "kl": 2.560546875, + "learning_rate": 2.656126748550301e-07, + "loss": 0.1382, + "num_tokens": 1229675216.0, + "reward": 1.07275390625, + "reward_std": 0.35024622082710266, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.93994140625, + "rewards/tag_count_reward/std": 0.17113468050956726, "step": 2186 }, { @@ -63411,25 +63411,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1906.0, - "completions/mean_length": 836.283203125, - "completions/mean_terminated_length": 794.668701171875, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 963.8125, + "completions/mean_terminated_length": 926.5778198242188, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, "epoch": 0.7466074933856789, - "grad_norm": 1.7893295288085938, - "kl": 5.5703125, - "learning_rate": 2.650802606221175e-07, - "loss": 0.361, - "num_tokens": 1183757644.0, - "reward": 1.94970703125, - "reward_std": 0.5130065679550171, - "rewards/accuracy_reward/mean": 0.13671875, - "rewards/accuracy_reward/std": 0.3438861668109894, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.18751464784145355, + "grad_norm": 2.071110725402832, + "kl": 2.810546875, + "learning_rate": 2.6519724027143977e-07, + "loss": 0.1593, + "num_tokens": 1230245680.0, + "reward": 1.1650390625, + "reward_std": 0.42103928327560425, + "rewards/accuracy_reward/mean": 0.19140625, + "rewards/accuracy_reward/std": 0.3937928080558777, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.9169921875, + "rewards/tag_count_reward/std": 0.19645671546459198, "step": 2187 }, { @@ -63438,27 +63438,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1989.0, - "completions/mean_length": 855.548828125, - "completions/mean_terminated_length": 809.59228515625, - "completions/min_length": 200.0, - "completions/min_terminated_length": 200.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 999.759765625, + "completions/mean_terminated_length": 981.00390625, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, "epoch": 0.7469488776990697, - "grad_norm": 1.3820817470550537, - "kl": 6.125, - "learning_rate": 2.6466550192515526e-07, - "loss": 0.4133, - "num_tokens": 1184272053.0, - "reward": 1.87451171875, - "reward_std": 0.5345189571380615, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.93505859375, - "rewards/tag_count_reward/std": 0.18454577028751373, + "grad_norm": 3.4647161960601807, + "kl": 2.11328125, + "learning_rate": 2.6478221022002987e-07, + "loss": 0.1177, + "num_tokens": 1230833925.0, + "reward": 1.09765625, + "reward_std": 0.3251683712005615, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.18033012747764587, "step": 2188 }, { @@ -63467,27 +63467,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 803.23828125, - "completions/mean_terminated_length": 773.364013671875, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 1921.0, + "completions/mean_length": 970.755859375, + "completions/mean_terminated_length": 957.9822387695312, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.7472902620124605, - "grad_norm": 1.103055715560913, - "kl": 5.5546875, - "learning_rate": 2.642511482083247e-07, - "loss": 0.3329, - "num_tokens": 1184758175.0, - "reward": 1.8681640625, - "reward_std": 0.5132533311843872, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.9345703125, - "rewards/tag_count_reward/std": 0.1847042590379715, + "grad_norm": 1.144181489944458, + "kl": 1.669921875, + "learning_rate": 2.643675852903069e-07, + "loss": 0.0937, + "num_tokens": 1231405816.0, + "reward": 1.099609375, + "reward_std": 0.34585076570510864, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.939453125, + "rewards/tag_count_reward/std": 0.16624696552753448, "step": 2189 }, { @@ -63496,27 +63496,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 839.50390625, - "completions/mean_terminated_length": 792.9290161132812, - "completions/min_length": 172.0, - "completions/min_terminated_length": 172.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 960.53125, + "completions/mean_terminated_length": 938.8685302734375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, "epoch": 0.7476316463258513, - "grad_norm": 1.225283145904541, - "kl": 5.71875, - "learning_rate": 2.638372000597251e-07, - "loss": 0.3729, - "num_tokens": 1185262065.0, - "reward": 1.833984375, - "reward_std": 0.5245562195777893, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.20272120833396912, + "grad_norm": 2.9684247970581055, + "kl": 1.79296875, + "learning_rate": 2.6395336607120155e-07, + "loss": 0.0952, + "num_tokens": 1231971672.0, + "reward": 1.08056640625, + "reward_std": 0.3182827830314636, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.93603515625, + "rewards/tag_count_reward/std": 0.16826297342777252, "step": 2190 }, { @@ -63525,27 +63525,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1937.0, - "completions/mean_length": 891.169921875, - "completions/mean_terminated_length": 834.276611328125, - "completions/min_length": 72.0, - "completions/min_terminated_length": 72.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 989.8125, + "completions/mean_terminated_length": 962.2445068359375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, "epoch": 0.7479730306392421, - "grad_norm": 1.0358110666275024, - "kl": 6.359375, - "learning_rate": 2.634236580668802e-07, - "loss": 0.3795, - "num_tokens": 1185798568.0, - "reward": 1.82421875, - "reward_std": 0.5024953484535217, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.199825257062912, + "grad_norm": 4.261856555938721, + "kl": 2.203125, + "learning_rate": 2.635395531510683e-07, + "loss": 0.0999, + "num_tokens": 1232558680.0, + "reward": 1.0302734375, + "reward_std": 0.32941746711730957, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.1867826133966446, "step": 2191 }, { @@ -63554,27 +63554,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1951.0, - "completions/mean_length": 894.794921875, - "completions/mean_terminated_length": 838.0798950195312, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1070.248046875, + "completions/mean_terminated_length": 1036.668701171875, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, "epoch": 0.748314414952633, - "grad_norm": 1.1366462707519531, - "kl": 6.828125, - "learning_rate": 2.630105228167369e-07, - "loss": 0.4337, - "num_tokens": 1186336559.0, - "reward": 1.8154296875, - "reward_std": 0.5645444989204407, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.19944952428340912, + "grad_norm": 4.148563385009766, + "kl": 2.046875, + "learning_rate": 2.6312614711768475e-07, + "loss": 0.1063, + "num_tokens": 1233186503.0, + "reward": 1.05126953125, + "reward_std": 0.34227079153060913, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.19215556979179382, "step": 2192 }, { @@ -63583,27 +63583,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 816.83203125, - "completions/mean_terminated_length": 766.7845458984375, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 986.458984375, + "completions/mean_terminated_length": 971.7445678710938, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, "epoch": 0.7486557992660238, - "grad_norm": 1.255671739578247, - "kl": 6.5625, - "learning_rate": 2.625977948956656e-07, - "loss": 0.4054, - "num_tokens": 1186822057.0, - "reward": 1.81591796875, - "reward_std": 0.49111637473106384, - "rewards/accuracy_reward/mean": 0.0234375, - "rewards/accuracy_reward/std": 0.15143637359142303, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.19143815338611603, + "grad_norm": 9.642166137695312, + "kl": 2.00390625, + "learning_rate": 2.6271314855825034e-07, + "loss": 0.0316, + "num_tokens": 1233758850.0, + "reward": 1.0771484375, + "reward_std": 0.33060088753700256, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.9462890625, + "rewards/tag_count_reward/std": 0.15178616344928741, "step": 2193 }, { @@ -63612,27 +63612,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 828.25, - "completions/mean_terminated_length": 796.4729614257812, - "completions/min_length": 138.0, - "completions/min_terminated_length": 138.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 997.6015625, + "completions/mean_terminated_length": 989.3306884765625, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, "epoch": 0.7489971835794145, - "grad_norm": 1.0259623527526855, - "kl": 6.5234375, - "learning_rate": 2.621854748894578e-07, - "loss": 0.4083, - "num_tokens": 1187323561.0, - "reward": 1.83642578125, - "reward_std": 0.4819796085357666, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.1794423907995224, + "grad_norm": 2.298248052597046, + "kl": 2.50390625, + "learning_rate": 2.6230055805938577e-07, + "loss": 0.1084, + "num_tokens": 1234347062.0, + "reward": 1.068359375, + "reward_std": 0.34794530272483826, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.19246920943260193, "step": 2194 }, { @@ -63643,25 +63643,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 761.302734375, - "completions/mean_terminated_length": 722.4688110351562, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 927.787109375, + "completions/mean_terminated_length": 893.9778442382812, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.7493385678928053, - "grad_norm": 1.4665205478668213, - "kl": 5.9921875, - "learning_rate": 2.6177356338332635e-07, - "loss": 0.3828, - "num_tokens": 1187795764.0, - "reward": 1.92724609375, - "reward_std": 0.5011767148971558, - "rewards/accuracy_reward/mean": 0.10282257944345474, - "rewards/accuracy_reward/std": 0.30403366684913635, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.94287109375, - "rewards/tag_count_reward/std": 0.17284587025642395, + "grad_norm": 4.533019542694092, + "kl": 3.75390625, + "learning_rate": 2.6188837620713223e-07, + "loss": 0.1869, + "num_tokens": 1234904505.0, + "reward": 1.11962890625, + "reward_std": 0.37996405363082886, + "rewards/accuracy_reward/mean": 0.15120968222618103, + "rewards/accuracy_reward/std": 0.35861483216285706, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.91455078125, + "rewards/tag_count_reward/std": 0.20247536897659302, "step": 2195 }, { @@ -63670,27 +63670,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, - "completions/mean_length": 839.35546875, - "completions/mean_terminated_length": 805.3775024414062, - "completions/min_length": 68.0, - "completions/min_terminated_length": 68.0, + "completions/mean_length": 1005.27734375, + "completions/mean_terminated_length": 971.64111328125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, "epoch": 0.7496799522061961, - "grad_norm": 2.3356518745422363, - "kl": 6.1640625, - "learning_rate": 2.6136206096190445e-07, - "loss": 0.385, - "num_tokens": 1188306682.0, - "reward": 1.833984375, - "reward_std": 0.47078582644462585, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.19572821259498596, + "grad_norm": 1.977700114250183, + "kl": 2.861328125, + "learning_rate": 2.6147660358695063e-07, + "loss": 0.1633, + "num_tokens": 1235500375.0, + "reward": 1.0498046875, + "reward_std": 0.30382025241851807, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.18327085673809052, "step": 2196 }, { @@ -63699,27 +63699,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, - "completions/mean_length": 782.974609375, - "completions/mean_terminated_length": 731.55078125, - "completions/min_length": 182.0, - "completions/min_terminated_length": 182.0, + "completions/mean_length": 918.67578125, + "completions/mean_terminated_length": 893.8802490234375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, "epoch": 0.7500213365195869, - "grad_norm": 2.1885030269622803, - "kl": 6.37890625, - "learning_rate": 2.609509682092442e-07, - "loss": 0.3773, - "num_tokens": 1188789133.0, - "reward": 1.88134765625, - "reward_std": 0.5129489302635193, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.1834864467382431, + "grad_norm": 3.8987481594085693, + "kl": 1.92578125, + "learning_rate": 2.610652407837201e-07, + "loss": 0.0727, + "num_tokens": 1236052305.0, + "reward": 1.18212890625, + "reward_std": 0.37431368231773376, + "rewards/accuracy_reward/mean": 0.169921875, + "rewards/accuracy_reward/std": 0.3759314715862274, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.94189453125, + "rewards/tag_count_reward/std": 0.1600138396024704, "step": 2197 }, { @@ -63728,27 +63728,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 819.435546875, - "completions/mean_terminated_length": 787.4288940429688, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 984.701171875, + "completions/mean_terminated_length": 959.1820678710938, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.7503627208329777, - "grad_norm": 1.824910044670105, - "kl": 5.765625, - "learning_rate": 2.6054028570881697e-07, - "loss": 0.3489, - "num_tokens": 1189283196.0, - "reward": 1.91357421875, - "reward_std": 0.5337316393852234, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.1656993180513382, + "grad_norm": 5.456165790557861, + "kl": 3.130859375, + "learning_rate": 2.606542883817381e-07, + "loss": 0.1349, + "num_tokens": 1236630984.0, + "reward": 1.16943359375, + "reward_std": 0.3833501636981964, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38430243730545044, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.92724609375, + "rewards/tag_count_reward/std": 0.18361139297485352, "step": 2198 }, { @@ -63757,27 +63757,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 732.29296875, - "completions/mean_terminated_length": 714.0554809570312, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 1893.0, + "completions/mean_length": 870.19921875, + "completions/mean_terminated_length": 851.5040283203125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, "epoch": 0.7507041051463685, - "grad_norm": 1.5997239351272583, - "kl": 5.65625, - "learning_rate": 2.6013001404351133e-07, - "loss": 0.3435, - "num_tokens": 1189730274.0, - "reward": 1.86376953125, - "reward_std": 0.5020423531532288, - "rewards/accuracy_reward/mean": 0.058467742055654526, - "rewards/accuracy_reward/std": 0.23486268520355225, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.93798828125, - "rewards/tag_count_reward/std": 0.1689939647912979, + "grad_norm": 4.165374755859375, + "kl": 2.171875, + "learning_rate": 2.602437469647189e-07, + "loss": 0.109, + "num_tokens": 1237148670.0, + "reward": 1.1533203125, + "reward_std": 0.366317480802536, + "rewards/accuracy_reward/mean": 0.14717741310596466, + "rewards/accuracy_reward/std": 0.3546403646469116, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.9482421875, + "rewards/tag_count_reward/std": 0.15720415115356445, "step": 2199 }, { @@ -63786,27 +63786,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 804.517578125, - "completions/mean_terminated_length": 761.8121337890625, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 994.494140625, + "completions/mean_terminated_length": 964.8775024414062, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, "epoch": 0.7510454894597594, - "grad_norm": 2.429795503616333, - "kl": 6.45703125, - "learning_rate": 2.5972015379563263e-07, - "loss": 0.417, - "num_tokens": 1190213915.0, - "reward": 1.86865234375, - "reward_std": 0.5221760272979736, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.18708612024784088, + "grad_norm": 4.184016704559326, + "kl": 2.66796875, + "learning_rate": 2.598336171157932e-07, + "loss": 0.1281, + "num_tokens": 1237729579.0, + "reward": 1.1171875, + "reward_std": 0.3676406741142273, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.935546875, + "rewards/tag_count_reward/std": 0.17203198373317719, "step": 2200 }, { @@ -63815,27 +63815,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 844.150390625, - "completions/mean_terminated_length": 815.258056640625, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1048.94921875, + "completions/mean_terminated_length": 1012.5465698242188, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, "epoch": 0.7513868737731502, - "grad_norm": 1.5156134366989136, - "kl": 4.828125, - "learning_rate": 2.5931070554690284e-07, - "loss": 0.2999, - "num_tokens": 1190722888.0, - "reward": 1.8701171875, - "reward_std": 0.49041420221328735, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.1779806911945343, + "grad_norm": 1.7787723541259766, + "kl": 2.58984375, + "learning_rate": 2.594238994175072e-07, + "loss": 0.1339, + "num_tokens": 1238343409.0, + "reward": 1.03076171875, + "reward_std": 0.34416288137435913, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.1923096626996994, "step": 2201 }, { @@ -63844,27 +63844,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 806.056640625, - "completions/mean_terminated_length": 768.5734252929688, - "completions/min_length": 61.0, - "completions/min_terminated_length": 61.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 985.39453125, + "completions/mean_terminated_length": 957.71142578125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.7517282580865409, - "grad_norm": 1.6273012161254883, - "kl": 4.71484375, - "learning_rate": 2.589016698784585e-07, - "loss": 0.3303, - "num_tokens": 1191204517.0, - "reward": 1.841796875, - "reward_std": 0.5300570726394653, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, + "grad_norm": 1.2324093580245972, + "kl": 1.77734375, + "learning_rate": 2.590145944518215e-07, + "loss": 0.0648, + "num_tokens": 1238916859.0, + "reward": 1.078125, + "reward_std": 0.3484116196632385, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.1808803677558899, + "rewards/tag_count_reward/std": 0.1767766922712326, "step": 2202 }, { @@ -63873,27 +63873,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1959.0, - "completions/mean_length": 819.947265625, - "completions/mean_terminated_length": 785.4236450195312, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 973.837890625, + "completions/mean_terminated_length": 948.0580444335938, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, "epoch": 0.7520696423999317, - "grad_norm": 1.6526873111724854, - "kl": 4.7734375, - "learning_rate": 2.5849304737085143e-07, - "loss": 0.3211, - "num_tokens": 1191696170.0, - "reward": 1.91015625, - "reward_std": 0.5129385590553284, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.16988569498062134, + "grad_norm": 3.588472366333008, + "kl": 1.466796875, + "learning_rate": 2.5860570280011027e-07, + "loss": 0.0903, + "num_tokens": 1239487304.0, + "reward": 1.1357421875, + "reward_std": 0.33816277980804443, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.9404296875, + "rewards/tag_count_reward/std": 0.16213536262512207, "step": 2203 }, { @@ -63902,27 +63902,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 885.30078125, - "completions/mean_terminated_length": 818.0371704101562, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1099.107421875, + "completions/mean_terminated_length": 1072.431640625, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, "epoch": 0.7524110267133225, - "grad_norm": 2.0095741748809814, - "kl": 6.8984375, - "learning_rate": 2.5808483860404605e-07, - "loss": 0.4544, - "num_tokens": 1192231028.0, - "reward": 1.80322265625, - "reward_std": 0.594826877117157, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.20380185544490814, + "grad_norm": 2.380033016204834, + "kl": 2.12109375, + "learning_rate": 2.581972250431611e-07, + "loss": 0.0921, + "num_tokens": 1240131631.0, + "reward": 1.10595703125, + "reward_std": 0.34697479009628296, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.18606694042682648, "step": 2204 }, { @@ -63933,25 +63933,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1882.0, - "completions/mean_length": 828.533203125, - "completions/mean_terminated_length": 776.3768310546875, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1056.634765625, + "completions/mean_terminated_length": 1014.2342529296875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, "epoch": 0.7527524110267133, - "grad_norm": 1.1258745193481445, - "kl": 5.2890625, - "learning_rate": 2.576770441574204e-07, - "loss": 0.3411, - "num_tokens": 1192729221.0, - "reward": 1.7998046875, - "reward_std": 0.5032345056533813, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.17890173196792603, + "grad_norm": 4.082900047302246, + "kl": 2.115234375, + "learning_rate": 2.5778916176117314e-07, + "loss": 0.0945, + "num_tokens": 1240746612.0, + "reward": 1.00830078125, + "reward_std": 0.3190525770187378, + "rewards/accuracy_reward/mean": 0.041015625, + "rewards/accuracy_reward/std": 0.19852031767368317, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.91455078125, + "rewards/tag_count_reward/std": 0.1994321495294571, "step": 2205 }, { @@ -63960,27 +63960,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1961.0, - "completions/mean_length": 787.1328125, - "completions/mean_terminated_length": 746.4596557617188, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1007.197265625, + "completions/mean_terminated_length": 984.3453369140625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, "epoch": 0.7530937953401041, - "grad_norm": 1.8775099515914917, - "kl": 5.5546875, - "learning_rate": 2.5726966460976406e-07, - "loss": 0.3404, - "num_tokens": 1193210649.0, - "reward": 1.8466796875, - "reward_std": 0.5258615016937256, - "rewards/accuracy_reward/mean": 0.07459677755832672, - "rewards/accuracy_reward/std": 0.263004869222641, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.9287109375, - "rewards/tag_count_reward/std": 0.17707644402980804, + "grad_norm": 1.7731800079345703, + "kl": 2.7578125, + "learning_rate": 2.5738151353375736e-07, + "loss": 0.1459, + "num_tokens": 1241340713.0, + "reward": 1.10400390625, + "reward_std": 0.37155601382255554, + "rewards/accuracy_reward/mean": 0.13508065044879913, + "rewards/accuracy_reward/std": 0.34215477108955383, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.18652856349945068, "step": 2206 }, { @@ -63989,27 +63989,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 878.25, - "completions/mean_terminated_length": 815.6707763671875, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1062.25390625, + "completions/mean_terminated_length": 1024.263671875, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, "epoch": 0.7534351796534949, - "grad_norm": 1.806650996208191, - "kl": 5.80859375, - "learning_rate": 2.5686270053927743e-07, - "loss": 0.3776, - "num_tokens": 1193738217.0, - "reward": 1.86474609375, - "reward_std": 0.592460572719574, + "grad_norm": 6.108999252319336, + "kl": 1.5625, + "learning_rate": 2.569742809399347e-07, + "loss": 0.0811, + "num_tokens": 1241962491.0, + "reward": 1.09814453125, + "reward_std": 0.3553093373775482, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.90771484375, - "rewards/tag_count_reward/std": 0.21251004934310913, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.18174928426742554, "step": 2207 }, { @@ -64018,27 +64018,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1911.0, - "completions/mean_length": 847.6328125, - "completions/mean_terminated_length": 816.3607177734375, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1079.341796875, + "completions/mean_terminated_length": 1048.0947265625, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, "epoch": 0.7537765639668857, - "grad_norm": 1.2657173871994019, - "kl": 5.484375, - "learning_rate": 2.5645615252357205e-07, - "loss": 0.3376, - "num_tokens": 1194255517.0, - "reward": 1.8271484375, - "reward_std": 0.5507446527481079, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.17696848511695862, + "grad_norm": 1.553903579711914, + "kl": 2.0625, + "learning_rate": 2.5656746455813615e-07, + "loss": 0.0969, + "num_tokens": 1242598426.0, + "reward": 1.06787109375, + "reward_std": 0.35006827116012573, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.92919921875, + "rewards/tag_count_reward/std": 0.18102756142616272, "step": 2208 }, { @@ -64047,27 +64047,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 813.611328125, - "completions/mean_terminated_length": 752.9036254882812, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 986.189453125, + "completions/mean_terminated_length": 958.5270385742188, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, "epoch": 0.7541179482802766, - "grad_norm": 2.1323258876800537, - "kl": 6.0, - "learning_rate": 2.560500211396681e-07, - "loss": 0.3812, - "num_tokens": 1194754086.0, - "reward": 1.85009765625, - "reward_std": 0.5097396373748779, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.19460150599479675, + "grad_norm": 1.4526729583740234, + "kl": 1.90625, + "learning_rate": 2.5616106496620125e-07, + "loss": 0.0765, + "num_tokens": 1243185355.0, + "reward": 1.08154296875, + "reward_std": 0.3351963758468628, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.16863170266151428, "step": 2209 }, { @@ -64076,27 +64076,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1969.0, - "completions/mean_length": 844.94140625, - "completions/mean_terminated_length": 790.926513671875, - "completions/min_length": 84.0, - "completions/min_terminated_length": 84.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1029.755859375, + "completions/mean_terminated_length": 1007.399169921875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.7544593325936674, - "grad_norm": 1.9357986450195312, - "kl": 5.15234375, - "learning_rate": 2.556443069639951e-07, - "loss": 0.3005, - "num_tokens": 1195267832.0, - "reward": 1.83544921875, - "reward_std": 0.575616717338562, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.20040719211101532, + "grad_norm": 2.486086130142212, + "kl": 2.078125, + "learning_rate": 2.557550827413776e-07, + "loss": 0.1048, + "num_tokens": 1243793726.0, + "reward": 1.078125, + "reward_std": 0.3204424977302551, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.18677493929862976, "step": 2210 }, { @@ -64105,27 +64105,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 758.11328125, - "completions/mean_terminated_length": 735.0337524414062, - "completions/min_length": 114.0, - "completions/min_terminated_length": 114.0, + "completions/max_terminated_length": 1942.0, + "completions/mean_length": 936.416015625, + "completions/mean_terminated_length": 923.2352294921875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, "epoch": 0.7548007169070581, - "grad_norm": 1.115371584892273, - "kl": 4.3828125, - "learning_rate": 2.5523901057238994e-07, - "loss": 0.2774, - "num_tokens": 1195727586.0, - "reward": 1.84912109375, - "reward_std": 0.45485416054725647, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.18048836290836334, + "grad_norm": 2.824995756149292, + "kl": 1.58984375, + "learning_rate": 2.5534951846032e-07, + "loss": 0.078, + "num_tokens": 1244344771.0, + "reward": 1.05615234375, + "reward_std": 0.29490965604782104, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.17079374194145203, "step": 2211 }, { @@ -64134,27 +64134,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1870.0, - "completions/mean_length": 739.970703125, - "completions/mean_terminated_length": 700.492919921875, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 906.55078125, + "completions/mean_terminated_length": 876.8136596679688, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.7551421012204489, - "grad_norm": 1.701364517211914, - "kl": 5.03125, - "learning_rate": 2.5483413254009666e-07, - "loss": 0.3306, - "num_tokens": 1196186163.0, - "reward": 1.87939453125, - "reward_std": 0.5180987119674683, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.1979798823595047, + "grad_norm": 2.355698585510254, + "kl": 2.67578125, + "learning_rate": 2.5494437269908976e-07, + "loss": 0.137, + "num_tokens": 1244888637.0, + "reward": 1.107421875, + "reward_std": 0.3455817997455597, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.19549374282360077, "step": 2212 }, { @@ -64163,27 +64163,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1973.0, - "completions/mean_length": 877.142578125, - "completions/mean_terminated_length": 817.0369873046875, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1042.615234375, + "completions/mean_terminated_length": 1018.4860229492188, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, "epoch": 0.7554834855338397, - "grad_norm": 1.9199590682983398, - "kl": 6.8203125, - "learning_rate": 2.544296734417658e-07, - "loss": 0.4529, - "num_tokens": 1196708844.0, - "reward": 1.82861328125, - "reward_std": 0.5699664354324341, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.20587307214736938, + "grad_norm": 2.3014628887176514, + "kl": 2.423828125, + "learning_rate": 2.545396460331529e-07, + "loss": 0.1327, + "num_tokens": 1245496040.0, + "reward": 1.10498046875, + "reward_std": 0.3370698094367981, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.16196657717227936, "step": 2213 }, { @@ -64194,25 +64194,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 882.943359375, - "completions/mean_terminated_length": 850.1907348632812, - "completions/min_length": 196.0, - "completions/min_terminated_length": 196.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1076.5625, + "completions/mean_terminated_length": 1049.2529296875, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, "epoch": 0.7558248698472305, - "grad_norm": 1.002043604850769, - "kl": 6.078125, - "learning_rate": 2.540256338514528e-07, - "loss": 0.3203, - "num_tokens": 1197239423.0, - "reward": 1.83203125, - "reward_std": 0.6202792525291443, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.912109375, - "rewards/tag_count_reward/std": 0.20413975417613983, + "grad_norm": 4.6682329177856445, + "kl": 1.9912109375, + "learning_rate": 2.5413533903738184e-07, + "loss": 0.0721, + "num_tokens": 1246125752.0, + "reward": 1.15478515625, + "reward_std": 0.35795092582702637, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.17312757670879364, "step": 2214 }, { @@ -64221,27 +64221,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1975.0, - "completions/mean_length": 854.2890625, - "completions/mean_terminated_length": 810.7935180664062, - "completions/min_length": 169.0, - "completions/min_terminated_length": 169.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1011.453125, + "completions/mean_terminated_length": 986.5760498046875, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, "epoch": 0.7561662541606213, - "grad_norm": 1.092342495918274, - "kl": 6.5390625, - "learning_rate": 2.536220143426182e-07, - "loss": 0.3901, - "num_tokens": 1197746675.0, - "reward": 1.86572265625, - "reward_std": 0.5874383449554443, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.2022770643234253, + "grad_norm": 2.1165542602539062, + "kl": 1.845703125, + "learning_rate": 2.5373145228605103e-07, + "loss": 0.0648, + "num_tokens": 1246713472.0, + "reward": 1.1064453125, + "reward_std": 0.31755271553993225, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.18260227143764496, "step": 2215 }, { @@ -64250,27 +64250,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1829.0, - "completions/mean_length": 835.908203125, - "completions/mean_terminated_length": 791.742919921875, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 974.40234375, + "completions/mean_terminated_length": 955.1928100585938, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, "epoch": 0.7565076384740121, - "grad_norm": 1.3190197944641113, - "kl": 7.03125, - "learning_rate": 2.532188154881258e-07, - "loss": 0.4324, - "num_tokens": 1198247268.0, - "reward": 1.87646484375, - "reward_std": 0.5922967195510864, - "rewards/accuracy_reward/mean": 0.10080645233392715, - "rewards/accuracy_reward/std": 0.30137622356414795, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.1933453381061554, + "grad_norm": 2.2689080238342285, + "kl": 2.51171875, + "learning_rate": 2.5332798635283947e-07, + "loss": 0.1149, + "num_tokens": 1247284974.0, + "reward": 1.11767578125, + "reward_std": 0.34139201045036316, + "rewards/accuracy_reward/mean": 0.1572580635547638, + "rewards/accuracy_reward/std": 0.36441144347190857, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.18737702071666718, "step": 2216 }, { @@ -64279,27 +64279,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 853.01953125, - "completions/mean_terminated_length": 783.8883666992188, - "completions/min_length": 94.0, - "completions/min_terminated_length": 94.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1001.1796875, + "completions/mean_terminated_length": 980.3267211914062, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, "epoch": 0.756849022787403, - "grad_norm": 4.725752353668213, - "kl": 8.8203125, - "learning_rate": 2.528160378602431e-07, - "loss": 0.5049, - "num_tokens": 1198756238.0, - "reward": 1.82861328125, - "reward_std": 0.5383328795433044, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.21307136118412018, + "grad_norm": 1.5622797012329102, + "kl": 2.640625, + "learning_rate": 2.5292494181082726e-07, + "loss": 0.1521, + "num_tokens": 1247869802.0, + "reward": 1.10546875, + "reward_std": 0.3828883171081543, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.17416280508041382, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.18666234612464905, "step": 2217 }, { @@ -64308,27 +64308,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1977.0, - "completions/mean_length": 858.107421875, - "completions/mean_terminated_length": 807.2159423828125, - "completions/min_length": 54.0, - "completions/min_terminated_length": 54.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1008.271484375, + "completions/mean_terminated_length": 991.7679443359375, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, "epoch": 0.7571904071007938, - "grad_norm": 5.096704483032227, - "kl": 8.921875, - "learning_rate": 2.5241368203063875e-07, - "loss": 0.4461, - "num_tokens": 1199267301.0, - "reward": 1.8154296875, - "reward_std": 0.590232253074646, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.9072265625, - "rewards/tag_count_reward/std": 0.22159916162490845, + "grad_norm": 2.656470537185669, + "kl": 1.96484375, + "learning_rate": 2.5252231923249703e-07, + "loss": 0.0688, + "num_tokens": 1248457749.0, + "reward": 1.11767578125, + "reward_std": 0.3618015646934509, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.17875948548316956, "step": 2218 }, { @@ -64337,27 +64337,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 881.2734375, - "completions/mean_terminated_length": 818.85595703125, - "completions/min_length": 3.0, - "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1030.08984375, + "completions/mean_terminated_length": 1005.6600341796875, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, "epoch": 0.7575317914141845, - "grad_norm": 2.777573585510254, - "kl": 7.6875, - "learning_rate": 2.5201174857038344e-07, - "loss": 0.4052, - "num_tokens": 1199792113.0, - "reward": 1.75732421875, - "reward_std": 0.6155025959014893, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.90185546875, - "rewards/tag_count_reward/std": 0.22775058448314667, + "grad_norm": 3.818528413772583, + "kl": 1.6171875, + "learning_rate": 2.5212011918973085e-07, + "loss": 0.0836, + "num_tokens": 1249058755.0, + "reward": 1.041015625, + "reward_std": 0.28096121549606323, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, + "rewards/tag_count_reward/mean": 0.943359375, + "rewards/tag_count_reward/std": 0.1547197550535202, "step": 2219 }, { @@ -64368,25 +64368,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 844.798828125, - "completions/mean_terminated_length": 800.95751953125, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1053.453125, + "completions/mean_terminated_length": 1017.214599609375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.7578731757275753, - "grad_norm": 2.8424341678619385, - "kl": 8.015625, - "learning_rate": 2.516102380499483e-07, - "loss": 0.4617, - "num_tokens": 1200305802.0, - "reward": 1.79638671875, - "reward_std": 0.5715229511260986, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.2052827626466751, + "grad_norm": 1.8749295473098755, + "kl": 2.30078125, + "learning_rate": 2.517183422538122e-07, + "loss": 0.1136, + "num_tokens": 1249679275.0, + "reward": 1.0576171875, + "reward_std": 0.3195943236351013, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.16662302613258362, "step": 2220 }, { @@ -64395,27 +64395,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1943.0, - "completions/mean_length": 841.0390625, - "completions/mean_terminated_length": 791.9755859375, - "completions/min_length": 197.0, - "completions/min_terminated_length": 197.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1005.833984375, + "completions/mean_terminated_length": 976.5361328125, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, "epoch": 0.7582145600409661, - "grad_norm": 1.5034443140029907, - "kl": 7.1796875, - "learning_rate": 2.51209151039204e-07, - "loss": 0.4459, - "num_tokens": 1200812878.0, - "reward": 1.90966796875, - "reward_std": 0.5487245321273804, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.196482852101326, + "grad_norm": 3.0081217288970947, + "kl": 2.361328125, + "learning_rate": 2.513169889954221e-07, + "loss": 0.0981, + "num_tokens": 1250270726.0, + "reward": 1.1669921875, + "reward_std": 0.4079582691192627, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39069411158561707, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.1844143271446228, "step": 2221 }, { @@ -64424,27 +64424,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1952.0, - "completions/mean_length": 868.2265625, - "completions/mean_terminated_length": 797.3912963867188, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1042.05078125, + "completions/mean_terminated_length": 1015.8436889648438, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, "epoch": 0.7585559443543569, - "grad_norm": 1.365991473197937, - "kl": 7.8203125, - "learning_rate": 2.5080848810742027e-07, - "loss": 0.4995, - "num_tokens": 1201334722.0, - "reward": 1.83056640625, - "reward_std": 0.5755613446235657, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.2162717580795288, + "grad_norm": 1.5602948665618896, + "kl": 2.3984375, + "learning_rate": 2.509160599846407e-07, + "loss": 0.114, + "num_tokens": 1250881568.0, + "reward": 1.1103515625, + "reward_std": 0.33246347308158875, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.1920890361070633, "step": 2222 }, { @@ -64453,27 +64453,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.068359375, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1850.0, - "completions/mean_length": 929.384765625, - "completions/mean_terminated_length": 847.3060302734375, - "completions/min_length": 115.0, - "completions/min_terminated_length": 115.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1158.548828125, + "completions/mean_terminated_length": 1107.0928955078125, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.7588973286677477, - "grad_norm": 1.0183286666870117, - "kl": 7.46875, - "learning_rate": 2.504082498232648e-07, - "loss": 0.4397, - "num_tokens": 1201896583.0, - "reward": 1.78076171875, - "reward_std": 0.5611814856529236, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.21210047602653503, + "grad_norm": 8.4295654296875, + "kl": 3.251953125, + "learning_rate": 2.5051555579094493e-07, + "loss": 0.1288, + "num_tokens": 1251560761.0, + "reward": 1.01953125, + "reward_std": 0.3765547275543213, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.2182645946741104, "step": 2223 }, { @@ -64482,27 +64482,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 826.6640625, - "completions/mean_terminated_length": 797.35205078125, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 992.859375, + "completions/mean_terminated_length": 971.8406372070312, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, "epoch": 0.7592387129811385, - "grad_norm": 1.4680759906768799, - "kl": 6.6015625, - "learning_rate": 2.5000843675480264e-07, - "loss": 0.4407, - "num_tokens": 1202390939.0, - "reward": 1.87548828125, - "reward_std": 0.5191822052001953, - "rewards/accuracy_reward/mean": 0.07056451588869095, - "rewards/accuracy_reward/std": 0.25635457038879395, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.18617989122867584, + "grad_norm": 3.7181661128997803, + "kl": 2.107421875, + "learning_rate": 2.501154769832089e-07, + "loss": 0.1094, + "num_tokens": 1252140209.0, + "reward": 1.07470703125, + "reward_std": 0.317842960357666, + "rewards/accuracy_reward/mean": 0.11693548411130905, + "rewards/accuracy_reward/std": 0.3216678202152252, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.17738577723503113, "step": 2224 }, { @@ -64511,27 +64511,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 865.591796875, - "completions/mean_terminated_length": 822.5081176757812, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1094.4375, + "completions/mean_terminated_length": 1061.68896484375, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, "epoch": 0.7595800972945294, - "grad_norm": 1.8334529399871826, - "kl": 6.265625, - "learning_rate": 2.4960904946949513e-07, - "loss": 0.4026, - "num_tokens": 1202907738.0, - "reward": 1.81298828125, - "reward_std": 0.5341126322746277, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.2117307335138321, + "grad_norm": 3.99417781829834, + "kl": 2.091796875, + "learning_rate": 2.4971582412970195e-07, + "loss": 0.077, + "num_tokens": 1252774177.0, + "reward": 1.04296875, + "reward_std": 0.32362744212150574, + "rewards/accuracy_reward/mean": 0.044921875, + "rewards/accuracy_reward/std": 0.20733514428138733, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.17341503500938416, "step": 2225 }, { @@ -64540,27 +64540,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 836.8984375, - "completions/mean_terminated_length": 795.3051147460938, - "completions/min_length": 172.0, - "completions/min_terminated_length": 172.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1023.025390625, + "completions/mean_terminated_length": 1002.6076049804688, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.7599214816079202, - "grad_norm": 3.2000937461853027, - "kl": 6.890625, - "learning_rate": 2.492100885341997e-07, - "loss": 0.4656, - "num_tokens": 1203409974.0, - "reward": 1.82177734375, - "reward_std": 0.586864173412323, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.2066698521375656, + "grad_norm": 2.892289876937866, + "kl": 2.1298828125, + "learning_rate": 2.4931659779808874e-07, + "loss": 0.1017, + "num_tokens": 1253371710.0, + "reward": 1.07958984375, + "reward_std": 0.33189767599105835, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.92529296875, + "rewards/tag_count_reward/std": 0.19132331013679504, "step": 2226 }, { @@ -64569,27 +64569,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 892.263671875, - "completions/mean_terminated_length": 837.9038696289062, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1107.267578125, + "completions/mean_terminated_length": 1052.844970703125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.7602628659213109, - "grad_norm": 0.8177643418312073, - "kl": 6.3125, - "learning_rate": 2.4881155451516844e-07, - "loss": 0.4118, - "num_tokens": 1203950845.0, - "reward": 1.90087890625, - "reward_std": 0.5213083624839783, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.20633205771446228, + "grad_norm": 2.559948444366455, + "kl": 2.431640625, + "learning_rate": 2.489177985554282e-07, + "loss": 0.1625, + "num_tokens": 1254022663.0, + "reward": 1.107421875, + "reward_std": 0.36859750747680664, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.196245014667511, "step": 2227 }, { @@ -64598,27 +64598,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 844.009765625, - "completions/mean_terminated_length": 787.38037109375, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1061.900390625, + "completions/mean_terminated_length": 1023.8965454101562, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, "epoch": 0.7606042502347017, - "grad_norm": 2.746137857437134, - "kl": 5.3671875, - "learning_rate": 2.484134479780473e-07, - "loss": 0.3835, - "num_tokens": 1204455746.0, - "reward": 1.83837890625, - "reward_std": 0.4725128412246704, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.19754503667354584, + "grad_norm": 3.3067076206207275, + "kl": 2.17578125, + "learning_rate": 2.485194269681723e-07, + "loss": 0.1171, + "num_tokens": 1254639124.0, + "reward": 0.99853515625, + "reward_std": 0.2973896861076355, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.17416280508041382, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.92041015625, + "rewards/tag_count_reward/std": 0.19126836955547333, "step": 2228 }, { @@ -64627,27 +64627,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.06640625, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1887.0, - "completions/mean_length": 879.423828125, - "completions/mean_terminated_length": 796.3033447265625, - "completions/min_length": 222.0, - "completions/min_terminated_length": 222.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1071.14453125, + "completions/mean_terminated_length": 1045.6954345703125, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, "epoch": 0.7609456345480925, - "grad_norm": 1.2507325410842896, - "kl": 7.0546875, - "learning_rate": 2.48015769487876e-07, - "loss": 0.448, - "num_tokens": 1204982187.0, - "reward": 1.7958984375, - "reward_std": 0.5250375866889954, - "rewards/accuracy_reward/mean": 0.04233871027827263, - "rewards/accuracy_reward/std": 0.2015640139579773, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.21008896827697754, + "grad_norm": 1.8414686918258667, + "kl": 2.1669921875, + "learning_rate": 2.481214836021657e-07, + "loss": 0.1146, + "num_tokens": 1255263726.0, + "reward": 1.08935546875, + "reward_std": 0.35469865798950195, + "rewards/accuracy_reward/mean": 0.10483870655298233, + "rewards/accuracy_reward/std": 0.30665475130081177, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.92724609375, + "rewards/tag_count_reward/std": 0.18092724680900574, "step": 2229 }, { @@ -64656,27 +64656,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 856.7421875, - "completions/mean_terminated_length": 798.1557006835938, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1051.2265625, + "completions/mean_terminated_length": 1000.0575561523438, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, "epoch": 0.7612870188614833, - "grad_norm": 1.2659684419631958, - "kl": 7.09375, - "learning_rate": 2.476185196090862e-07, - "loss": 0.4265, - "num_tokens": 1205496663.0, - "reward": 1.810546875, - "reward_std": 0.5451551675796509, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.2096906155347824, + "grad_norm": 2.4028053283691406, + "kl": 3.125, + "learning_rate": 2.4772396902264505e-07, + "loss": 0.2008, + "num_tokens": 1255877778.0, + "reward": 1.03076171875, + "reward_std": 0.3477475941181183, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.90966796875, + "rewards/tag_count_reward/std": 0.2057616412639618, "step": 2230 }, { @@ -64685,27 +64685,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1946.0, - "completions/mean_length": 799.755859375, - "completions/mean_terminated_length": 762.0824584960938, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 974.673828125, + "completions/mean_terminated_length": 933.3082885742188, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, "epoch": 0.7616284031748741, - "grad_norm": 1.7775336503982544, - "kl": 6.5, - "learning_rate": 2.472216989055015e-07, - "loss": 0.4011, - "num_tokens": 1205978778.0, - "reward": 1.81396484375, - "reward_std": 0.576111912727356, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.21019071340560913, + "grad_norm": 2.8248589038848877, + "kl": 2.59375, + "learning_rate": 2.4732688379423744e-07, + "loss": 0.121, + "num_tokens": 1256449451.0, + "reward": 1.04296875, + "reward_std": 0.29123976826667786, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.18413691222667694, "step": 2231 }, { @@ -64714,27 +64714,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 905.30859375, - "completions/mean_terminated_length": 844.1769409179688, - "completions/min_length": 184.0, - "completions/min_terminated_length": 184.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1107.427734375, + "completions/mean_terminated_length": 1067.1995849609375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.7619697874882649, - "grad_norm": 2.275806188583374, - "kl": 8.7109375, - "learning_rate": 2.468253079403362e-07, - "loss": 0.5015, - "num_tokens": 1206514760.0, - "reward": 1.7783203125, - "reward_std": 0.6185402870178223, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.8974609375, - "rewards/tag_count_reward/std": 0.23141992092132568, + "grad_norm": 4.481600284576416, + "kl": 1.8720703125, + "learning_rate": 2.4693022848096054e-07, + "loss": 0.0813, + "num_tokens": 1257088918.0, + "reward": 1.09375, + "reward_std": 0.3368939161300659, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.17270830273628235, "step": 2232 }, { @@ -64743,27 +64743,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 831.134765625, - "completions/mean_terminated_length": 768.6673583984375, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1032.6171875, + "completions/mean_terminated_length": 984.85888671875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.7623111718016558, - "grad_norm": 1.531739592552185, - "kl": 8.0078125, - "learning_rate": 2.464293472761948e-07, - "loss": 0.5286, - "num_tokens": 1207030653.0, - "reward": 1.82177734375, - "reward_std": 0.5514932870864868, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.20447123050689697, + "grad_norm": 5.15520715713501, + "kl": 2.458984375, + "learning_rate": 2.465340036462213e-07, + "loss": 0.1327, + "num_tokens": 1257707970.0, + "reward": 1.07373046875, + "reward_std": 0.3484814465045929, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.17416280508041382, + "rewards/tag_count_reward/mean": 0.91357421875, + "rewards/tag_count_reward/std": 0.2008453756570816, "step": 2233 }, { @@ -64772,27 +64772,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1955.0, - "completions/mean_length": 854.140625, - "completions/mean_terminated_length": 813.139404296875, - "completions/min_length": 89.0, - "completions/min_terminated_length": 89.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1110.82421875, + "completions/mean_terminated_length": 1054.554931640625, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, "epoch": 0.7626525561150466, - "grad_norm": 1.9341306686401367, - "kl": 6.859375, - "learning_rate": 2.460338174750713e-07, - "loss": 0.3934, - "num_tokens": 1207549829.0, - "reward": 1.77490234375, - "reward_std": 0.5592837333679199, - "rewards/accuracy_reward/mean": 0.02734375, - "rewards/accuracy_reward/std": 0.16324250400066376, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.91162109375, - "rewards/tag_count_reward/std": 0.2089642435312271, + "grad_norm": 4.962558746337891, + "kl": 2.69921875, + "learning_rate": 2.4613820985281524e-07, + "loss": 0.1345, + "num_tokens": 1258358568.0, + "reward": 1.0185546875, + "reward_std": 0.33296477794647217, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.9091796875, + "rewards/tag_count_reward/std": 0.20524843037128448, "step": 2234 }, { @@ -64801,27 +64801,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1849.0, - "completions/mean_length": 813.28125, - "completions/mean_terminated_length": 752.5573120117188, - "completions/min_length": 76.0, - "completions/min_terminated_length": 76.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1046.9609375, + "completions/mean_terminated_length": 1004.1466674804688, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, "epoch": 0.7629939404284373, - "grad_norm": 1.682078242301941, - "kl": 7.671875, - "learning_rate": 2.4563871909834755e-07, - "loss": 0.4935, - "num_tokens": 1208044581.0, - "reward": 1.83203125, - "reward_std": 0.5297613739967346, - "rewards/accuracy_reward/mean": 0.058467742055654526, - "rewards/accuracy_reward/std": 0.23486268520355225, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.1992122381925583, + "grad_norm": 2.2278881072998047, + "kl": 2.34765625, + "learning_rate": 2.457428476629253e-07, + "loss": 0.1471, + "num_tokens": 1258972964.0, + "reward": 1.0546875, + "reward_std": 0.33507412672042847, + "rewards/accuracy_reward/mean": 0.07459677755832672, + "rewards/accuracy_reward/std": 0.263004869222641, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.935546875, + "rewards/tag_count_reward/std": 0.1810387820005417, "step": 2235 }, { @@ -64830,27 +64830,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 843.78125, - "completions/mean_terminated_length": 804.9354858398438, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1040.38671875, + "completions/mean_terminated_length": 999.4268188476562, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, "epoch": 0.7633353247418281, - "grad_norm": 1.6867430210113525, - "kl": 7.328125, - "learning_rate": 2.4524405270679386e-07, - "loss": 0.4435, - "num_tokens": 1208562085.0, - "reward": 1.82177734375, - "reward_std": 0.5510420203208923, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.24230584502220154, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.2060771882534027, + "grad_norm": 2.150195837020874, + "kl": 2.26953125, + "learning_rate": 2.45347917638122e-07, + "loss": 0.1363, + "num_tokens": 1259591130.0, + "reward": 1.06640625, + "reward_std": 0.29895350337028503, + "rewards/accuracy_reward/mean": 0.0947580635547638, + "rewards/accuracy_reward/std": 0.29317617416381836, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.16621248424053192, "step": 2236 }, { @@ -64859,27 +64859,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 865.666015625, - "completions/mean_terminated_length": 817.6036376953125, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 1016.697265625, + "completions/mean_terminated_length": 970.3938598632812, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.7636767090552189, - "grad_norm": 2.1145455837249756, - "kl": 7.265625, - "learning_rate": 2.4484981886056647e-07, - "loss": 0.4384, - "num_tokens": 1209085882.0, - "reward": 1.81201171875, - "reward_std": 0.5959441661834717, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.2068454772233963, + "grad_norm": 1.7956606149673462, + "kl": 3.037109375, + "learning_rate": 2.4495342033936115e-07, + "loss": 0.1769, + "num_tokens": 1260192255.0, + "reward": 1.0703125, + "reward_std": 0.36397427320480347, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.17416280508041382, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.2041865587234497, "step": 2237 }, { @@ -64888,27 +64888,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1937.0, - "completions/mean_length": 848.09375, - "completions/mean_terminated_length": 794.2203979492188, - "completions/min_length": 58.0, - "completions/min_terminated_length": 58.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1056.0234375, + "completions/mean_terminated_length": 1019.8785400390625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, "epoch": 0.7640180933686097, - "grad_norm": 1.2139153480529785, - "kl": 5.515625, - "learning_rate": 2.444560181192087e-07, - "loss": 0.3383, - "num_tokens": 1209603162.0, - "reward": 1.88232421875, - "reward_std": 0.5427642464637756, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.19449345767498016, + "grad_norm": 2.8235387802124023, + "kl": 2.7890625, + "learning_rate": 2.4455935632698474e-07, + "loss": 0.1211, + "num_tokens": 1260815995.0, + "reward": 1.0966796875, + "reward_std": 0.40870654582977295, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.9091796875, + "rewards/tag_count_reward/std": 0.20761838555335999, "step": 2238 }, { @@ -64917,27 +64917,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 869.572265625, - "completions/mean_terminated_length": 829.10107421875, - "completions/min_length": 174.0, - "completions/min_terminated_length": 174.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1085.259765625, + "completions/mean_terminated_length": 1060.1783447265625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, "epoch": 0.7643594776820005, - "grad_norm": 2.9974546432495117, - "kl": 5.22265625, - "learning_rate": 2.4406265104164814e-07, - "loss": 0.3595, - "num_tokens": 1210122351.0, - "reward": 1.87060546875, - "reward_std": 0.5087319016456604, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.18968312442302704, + "grad_norm": 2.217665910720825, + "kl": 2.53125, + "learning_rate": 2.4416572616071895e-07, + "loss": 0.1335, + "num_tokens": 1261445616.0, + "reward": 1.08056640625, + "reward_std": 0.33928707242012024, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.18900687992572784, "step": 2239 }, { @@ -64946,27 +64946,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 812.63671875, - "completions/mean_terminated_length": 775.3521118164062, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1027.1015625, + "completions/mean_terminated_length": 1006.7649536132812, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, "epoch": 0.7647008619953913, - "grad_norm": 2.801384925842285, - "kl": 4.4375, - "learning_rate": 2.4366971818619785e-07, - "loss": 0.3232, - "num_tokens": 1210618165.0, - "reward": 1.943359375, - "reward_std": 0.500016450881958, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.9453125, - "rewards/tag_count_reward/std": 0.17884030938148499, + "grad_norm": 3.055691957473755, + "kl": 2.099609375, + "learning_rate": 2.4377253039967396e-07, + "loss": 0.1134, + "num_tokens": 1262051236.0, + "reward": 1.1318359375, + "reward_std": 0.35164132714271545, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.18845312297344208, "step": 2240 }, { @@ -64975,27 +64975,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1958.0, - "completions/mean_length": 897.6953125, - "completions/mean_terminated_length": 826.099609375, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1092.228515625, + "completions/mean_terminated_length": 1059.404052734375, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, "epoch": 0.7650422463087821, - "grad_norm": 1.5173786878585815, - "kl": 6.7734375, - "learning_rate": 2.4327722011055407e-07, - "loss": 0.4403, - "num_tokens": 1211150841.0, - "reward": 1.77783203125, - "reward_std": 0.5543250441551208, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.2249559760093689, + "grad_norm": 5.009459972381592, + "kl": 2.705078125, + "learning_rate": 2.433797696023424e-07, + "loss": 0.1668, + "num_tokens": 1262683513.0, + "reward": 1.04541015625, + "reward_std": 0.32103317975997925, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.025390625, + "rewards/format_reward/std": 0.15746226906776428, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.19157785177230835, "step": 2241 }, { @@ -65004,27 +65004,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 839.140625, - "completions/mean_terminated_length": 782.2821655273438, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1040.01171875, + "completions/mean_terminated_length": 977.27392578125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.765383630622173, - "grad_norm": 0.8445493578910828, - "kl": 6.60546875, - "learning_rate": 2.428851573717961e-07, - "loss": 0.4365, - "num_tokens": 1211663329.0, - "reward": 1.86767578125, - "reward_std": 0.5464333295822144, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.19940820336341858, + "grad_norm": 3.4729323387145996, + "kl": 2.47265625, + "learning_rate": 2.4298744432659973e-07, + "loss": 0.1464, + "num_tokens": 1263298847.0, + "reward": 1.13330078125, + "reward_std": 0.39557763934135437, + "rewards/accuracy_reward/mean": 0.173828125, + "rewards/accuracy_reward/std": 0.3793322443962097, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.91455078125, + "rewards/tag_count_reward/std": 0.20126360654830933, "step": 2242 }, { @@ -65033,27 +65033,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1965.0, - "completions/mean_length": 874.228515625, - "completions/mean_terminated_length": 828.9918823242188, - "completions/min_length": 210.0, - "completions/min_terminated_length": 210.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1102.998046875, + "completions/mean_terminated_length": 1068.5648193359375, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, "epoch": 0.7657250149355637, - "grad_norm": 1.9729790687561035, - "kl": 5.7265625, - "learning_rate": 2.42493530526385e-07, - "loss": 0.3744, - "num_tokens": 1212189494.0, - "reward": 1.83740234375, - "reward_std": 0.48765993118286133, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.19032683968544006, + "grad_norm": 9.142526626586914, + "kl": 3.12109375, + "learning_rate": 2.4259555512970206e-07, + "loss": 0.1074, + "num_tokens": 1263942142.0, + "reward": 1.06640625, + "reward_std": 0.3728174865245819, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.19499453902244568, "step": 2243 }, { @@ -65062,27 +65062,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 823.65625, - "completions/mean_terminated_length": 786.7042236328125, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1085.685546875, + "completions/mean_terminated_length": 1001.917236328125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, "epoch": 0.7660663992489545, - "grad_norm": 1.0354989767074585, - "kl": 5.359375, - "learning_rate": 2.421023401301636e-07, - "loss": 0.3262, - "num_tokens": 1212696454.0, - "reward": 1.85009765625, - "reward_std": 0.5369447469711304, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.19817768037319183, + "grad_norm": 1.9588756561279297, + "kl": 3.26953125, + "learning_rate": 2.422041025682869e-07, + "loss": 0.1697, + "num_tokens": 1264583261.0, + "reward": 1.0537109375, + "reward_std": 0.33580493927001953, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.8857421875, + "rewards/tag_count_reward/std": 0.2317335158586502, "step": 2244 }, { @@ -65091,27 +65091,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 843.009765625, - "completions/mean_terminated_length": 814.0900268554688, - "completions/min_length": 134.0, - "completions/min_terminated_length": 134.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1073.7421875, + "completions/mean_terminated_length": 1036.1947021484375, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.7664077835623453, - "grad_norm": 1.4246184825897217, - "kl": 5.5546875, - "learning_rate": 2.417115867383553e-07, - "loss": 0.317, - "num_tokens": 1213208827.0, - "reward": 1.82373046875, - "reward_std": 0.4728469252586365, - "rewards/accuracy_reward/mean": 0.03427419438958168, - "rewards/accuracy_reward/std": 0.18211629986763, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.18766237795352936, + "grad_norm": 3.4614264965057373, + "kl": 2.44921875, + "learning_rate": 2.4181308719837103e-07, + "loss": 0.1533, + "num_tokens": 1265213769.0, + "reward": 1.0205078125, + "reward_std": 0.27063578367233276, + "rewards/accuracy_reward/mean": 0.05645161122083664, + "rewards/accuracy_reward/std": 0.23102475702762604, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.1784525215625763, "step": 2245 }, { @@ -65120,27 +65120,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1871.0, - "completions/mean_length": 812.123046875, - "completions/mean_terminated_length": 764.4928588867188, - "completions/min_length": 81.0, - "completions/min_terminated_length": 81.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1070.85546875, + "completions/mean_terminated_length": 1024.8956298828125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, "epoch": 0.7667491678757361, - "grad_norm": 2.089632272720337, - "kl": 7.046875, - "learning_rate": 2.4132127090556265e-07, - "loss": 0.4163, - "num_tokens": 1213712970.0, - "reward": 1.8505859375, - "reward_std": 0.6257754564285278, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.9111328125, - "rewards/tag_count_reward/std": 0.21021628379821777, + "grad_norm": 3.315122127532959, + "kl": 2.56640625, + "learning_rate": 2.414225095753506e-07, + "loss": 0.1277, + "num_tokens": 1265850383.0, + "reward": 1.11474609375, + "reward_std": 0.3514671325683594, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.91943359375, + "rewards/tag_count_reward/std": 0.19716253876686096, "step": 2246 }, { @@ -65149,27 +65149,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 791.880859375, - "completions/mean_terminated_length": 748.741455078125, - "completions/min_length": 33.0, - "completions/min_terminated_length": 33.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1040.91796875, + "completions/mean_terminated_length": 1008.431396484375, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, "epoch": 0.7670905521891269, - "grad_norm": 1.114135503768921, - "kl": 6.640625, - "learning_rate": 2.4093139318576793e-07, - "loss": 0.4021, - "num_tokens": 1214196013.0, - "reward": 1.83935546875, - "reward_std": 0.5053403973579407, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.1845095157623291, + "grad_norm": 4.62288761138916, + "kl": 2.34765625, + "learning_rate": 2.4103237025399946e-07, + "loss": 0.1043, + "num_tokens": 1266460933.0, + "reward": 1.0458984375, + "reward_std": 0.3049134612083435, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.1804969757795334, "step": 2247 }, { @@ -65178,27 +65178,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1933.0, - "completions/mean_length": 795.109375, - "completions/mean_terminated_length": 741.5234375, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 983.7578125, + "completions/mean_terminated_length": 940.4959106445312, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.7674319365025177, - "grad_norm": 1.6775933504104614, - "kl": 7.2265625, - "learning_rate": 2.405419541323314e-07, - "loss": 0.4287, - "num_tokens": 1214684757.0, - "reward": 1.849609375, - "reward_std": 0.5888253450393677, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.21164102852344513, + "grad_norm": 6.021011829376221, + "kl": 2.734375, + "learning_rate": 2.406426697884696e-07, + "loss": 0.1978, + "num_tokens": 1267046265.0, + "reward": 1.0947265625, + "reward_std": 0.31461101770401, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.20006181299686432, "step": 2248 }, { @@ -65207,27 +65207,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 815.65625, - "completions/mean_terminated_length": 781.0120239257812, - "completions/min_length": 51.0, - "completions/min_terminated_length": 51.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1056.529296875, + "completions/mean_terminated_length": 1018.3184204101562, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, "epoch": 0.7677733208159085, - "grad_norm": 2.696317672729492, - "kl": 6.54296875, - "learning_rate": 2.4015295429799e-07, - "loss": 0.3596, - "num_tokens": 1215179813.0, - "reward": 1.8779296875, - "reward_std": 0.5376981496810913, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.1959892213344574, + "grad_norm": 7.167107582092285, + "kl": 3.22265625, + "learning_rate": 2.4025340873228897e-07, + "loss": 0.1643, + "num_tokens": 1267664648.0, + "reward": 1.09716796875, + "reward_std": 0.3739963173866272, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.19546380639076233, "step": 2249 }, { @@ -65236,27 +65236,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 914.142578125, - "completions/mean_terminated_length": 865.647705078125, - "completions/min_length": 238.0, - "completions/min_terminated_length": 238.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1072.5546875, + "completions/mean_terminated_length": 1037.01220703125, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, "epoch": 0.7681147051292994, - "grad_norm": 1.5230695009231567, - "kl": 7.40625, - "learning_rate": 2.397643942348584e-07, - "loss": 0.4488, - "num_tokens": 1215723390.0, - "reward": 1.79248046875, - "reward_std": 0.6309556365013123, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.90771484375, - "rewards/tag_count_reward/std": 0.21193371713161469, + "grad_norm": 3.9639720916748047, + "kl": 2.140625, + "learning_rate": 2.3986458763836177e-07, + "loss": 0.0796, + "num_tokens": 1268289332.0, + "reward": 1.048828125, + "reward_std": 0.3164241313934326, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.18146054446697235, "step": 2250 }, { @@ -65265,27 +65265,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1863.0, - "completions/mean_length": 840.552734375, - "completions/mean_terminated_length": 791.469482421875, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1100.318359375, + "completions/mean_terminated_length": 1063.795166015625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.7684560894426901, - "grad_norm": 1.3939034938812256, - "kl": 6.22265625, - "learning_rate": 2.393762744944261e-07, - "loss": 0.3971, - "num_tokens": 1216230905.0, - "reward": 1.8212890625, - "reward_std": 0.523719072341919, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.9287109375, - "rewards/tag_count_reward/std": 0.1935756355524063, + "grad_norm": 5.53609037399292, + "kl": 2.138671875, + "learning_rate": 2.3947620705896734e-07, + "loss": 0.0613, + "num_tokens": 1268929847.0, + "reward": 1.05224609375, + "reward_std": 0.322548508644104, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.1771216243505478, "step": 2251 }, { @@ -65294,27 +65294,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1974.0, - "completions/mean_length": 839.083984375, - "completions/mean_terminated_length": 815.0020141601562, - "completions/min_length": 172.0, - "completions/min_terminated_length": 172.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1107.16796875, + "completions/mean_terminated_length": 1058.8707275390625, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, "epoch": 0.7687974737560809, - "grad_norm": 1.0364001989364624, - "kl": 5.78125, - "learning_rate": 2.389885956275585e-07, - "loss": 0.3586, - "num_tokens": 1216740964.0, - "reward": 1.83154296875, - "reward_std": 0.5374894142150879, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.1958593726158142, + "grad_norm": 8.220803260803223, + "kl": 2.80859375, + "learning_rate": 2.3908826754575923e-07, + "loss": 0.2052, + "num_tokens": 1269577165.0, + "reward": 1.0205078125, + "reward_std": 0.3072895109653473, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.9072265625, + "rewards/tag_count_reward/std": 0.20556476712226868, "step": 2252 }, { @@ -65323,27 +65323,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1950.0, - "completions/mean_length": 793.83984375, - "completions/mean_terminated_length": 773.9325561523438, - "completions/min_length": 100.0, - "completions/min_terminated_length": 100.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1055.041015625, + "completions/mean_terminated_length": 1008.33740234375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.7691388580694717, - "grad_norm": 1.384509563446045, - "kl": 5.0625, - "learning_rate": 2.386013581844945e-07, - "loss": 0.303, - "num_tokens": 1217236050.0, - "reward": 1.8642578125, - "reward_std": 0.4752752482891083, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.17448893189430237, + "grad_norm": 2.3903872966766357, + "kl": 3.42578125, + "learning_rate": 2.3870076964976424e-07, + "loss": 0.206, + "num_tokens": 1270205986.0, + "reward": 1.0302734375, + "reward_std": 0.3425173759460449, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.9072265625, + "rewards/tag_count_reward/std": 0.21143096685409546, "step": 2253 }, { @@ -65352,27 +65352,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1961.0, - "completions/mean_length": 816.7265625, - "completions/mean_terminated_length": 779.5653686523438, - "completions/min_length": 123.0, - "completions/min_terminated_length": 123.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1091.41796875, + "completions/mean_terminated_length": 1048.4693603515625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.7694802423828625, - "grad_norm": 1.182735800743103, - "kl": 6.3359375, - "learning_rate": 2.3821456271484704e-07, - "loss": 0.383, - "num_tokens": 1217732006.0, - "reward": 1.82470703125, - "reward_std": 0.5288053154945374, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.1953708976507187, + "grad_norm": 5.985517978668213, + "kl": 2.99609375, + "learning_rate": 2.3831371392138237e-07, + "loss": 0.1193, + "num_tokens": 1270842584.0, + "reward": 1.04150390625, + "reward_std": 0.35622867941856384, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.91845703125, + "rewards/tag_count_reward/std": 0.19172243773937225, "step": 2254 }, { @@ -65381,27 +65381,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 756.423828125, - "completions/mean_terminated_length": 730.6952514648438, - "completions/min_length": 176.0, - "completions/min_terminated_length": 176.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 981.365234375, + "completions/mean_terminated_length": 951.3794555664062, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, "epoch": 0.7698216266962533, - "grad_norm": 1.7976510524749756, - "kl": 4.87109375, - "learning_rate": 2.3782820976760153e-07, - "loss": 0.3435, - "num_tokens": 1218187535.0, - "reward": 1.99560546875, - "reward_std": 0.4458416700363159, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, - "rewards/format_reward/mean": 0.91015625, - "rewards/format_reward/std": 0.2862374484539032, - "rewards/tag_count_reward/mean": 0.95849609375, - "rewards/tag_count_reward/std": 0.1543913036584854, + "grad_norm": 5.019622325897217, + "kl": 2.416015625, + "learning_rate": 2.3792710091038506e-07, + "loss": 0.159, + "num_tokens": 1271413283.0, + "reward": 1.13037109375, + "reward_std": 0.3536721467971802, + "rewards/accuracy_reward/mean": 0.169921875, + "rewards/accuracy_reward/std": 0.3759314715862274, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.18511444330215454, "step": 2255 }, { @@ -65410,27 +65410,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1950.0, - "completions/mean_length": 823.265625, - "completions/mean_terminated_length": 776.0648803710938, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1041.447265625, + "completions/mean_terminated_length": 1004.7713012695312, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, "epoch": 0.7701630110096441, - "grad_norm": 0.7203352451324463, - "kl": 5.7578125, - "learning_rate": 2.374422998911151e-07, - "loss": 0.3601, - "num_tokens": 1218689927.0, - "reward": 1.880859375, - "reward_std": 0.4321858286857605, - "rewards/accuracy_reward/mean": 0.04032257944345474, - "rewards/accuracy_reward/std": 0.19691328704357147, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.9375, - "rewards/tag_count_reward/std": 0.1843961924314499, + "grad_norm": 6.263581275939941, + "kl": 2.052734375, + "learning_rate": 2.3754093116591534e-07, + "loss": 0.1167, + "num_tokens": 1272027384.0, + "reward": 1.04248046875, + "reward_std": 0.3408612012863159, + "rewards/accuracy_reward/mean": 0.0786290317773819, + "rewards/accuracy_reward/std": 0.26943066716194153, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.19039209187030792, "step": 2256 }, { @@ -65439,27 +65439,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 843.36328125, - "completions/mean_terminated_length": 811.97998046875, - "completions/min_length": 124.0, - "completions/min_terminated_length": 124.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1073.076171875, + "completions/mean_terminated_length": 1043.65185546875, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, "epoch": 0.7705043953230349, - "grad_norm": 1.8895782232284546, - "kl": 5.0390625, - "learning_rate": 2.3705683363311656e-07, - "loss": 0.3478, - "num_tokens": 1219197249.0, - "reward": 1.908203125, - "reward_std": 0.4586414396762848, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.896484375, - "rewards/format_reward/std": 0.30492907762527466, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.1733599156141281, + "grad_norm": 1.9384559392929077, + "kl": 2.447265625, + "learning_rate": 2.3715520523648647e-07, + "loss": 0.1337, + "num_tokens": 1272652319.0, + "reward": 1.044921875, + "reward_std": 0.34502896666526794, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.19659529626369476, "step": 2257 }, { @@ -65468,27 +65468,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 793.478515625, - "completions/mean_terminated_length": 745.1298217773438, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1042.369140625, + "completions/mean_terminated_length": 1014.0983276367188, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.7708457796364258, - "grad_norm": 1.2977958917617798, - "kl": 7.59375, - "learning_rate": 2.3667181154070443e-07, - "loss": 0.4755, - "num_tokens": 1219677638.0, - "reward": 1.81689453125, - "reward_std": 0.4978317320346832, - "rewards/accuracy_reward/mean": 0.032258063554763794, - "rewards/accuracy_reward/std": 0.17686307430267334, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.1847475916147232, + "grad_norm": 2.814941883087158, + "kl": 2.173828125, + "learning_rate": 2.3676992366998136e-07, + "loss": 0.1259, + "num_tokens": 1273260140.0, + "reward": 1.03466796875, + "reward_std": 0.2768687903881073, + "rewards/accuracy_reward/mean": 0.07258064299821854, + "rewards/accuracy_reward/std": 0.25970885157585144, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.18054130673408508, "step": 2258 }, { @@ -65497,27 +65497,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1957.0, - "completions/mean_length": 842.4609375, - "completions/mean_terminated_length": 811.0541381835938, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1112.818359375, + "completions/mean_terminated_length": 1084.593505859375, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, "epoch": 0.7711871639498165, - "grad_norm": 1.7056665420532227, - "kl": 7.6796875, - "learning_rate": 2.3628723416034742e-07, - "loss": 0.4458, - "num_tokens": 1220177330.0, - "reward": 1.83935546875, - "reward_std": 0.6094324588775635, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.19778190553188324, + "grad_norm": 1.5303236246109009, + "kl": 1.8232421875, + "learning_rate": 2.3638508701365153e-07, + "loss": 0.0822, + "num_tokens": 1273898255.0, + "reward": 1.12451171875, + "reward_std": 0.3661800026893616, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.93310546875, + "rewards/tag_count_reward/std": 0.17775706946849823, "step": 2259 }, { @@ -65528,25 +65528,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 805.095703125, - "completions/mean_terminated_length": 754.5711059570312, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1044.966796875, + "completions/mean_terminated_length": 1004.1930541992188, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, "epoch": 0.7715285482632073, - "grad_norm": 2.854001998901367, - "kl": 8.21875, - "learning_rate": 2.359031020378827e-07, - "loss": 0.5032, - "num_tokens": 1220664371.0, - "reward": 1.85986328125, - "reward_std": 0.6096511483192444, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.22019492089748383, + "grad_norm": 1.779982089996338, + "kl": 2.46875, + "learning_rate": 2.3600069581411693e-07, + "loss": 0.1561, + "num_tokens": 1274508110.0, + "reward": 1.11474609375, + "reward_std": 0.3632776141166687, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.91943359375, + "rewards/tag_count_reward/std": 0.1952926218509674, "step": 2260 }, { @@ -65555,27 +65555,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1996.0, - "completions/mean_length": 806.982421875, - "completions/mean_terminated_length": 766.9495849609375, - "completions/min_length": 58.0, - "completions/min_terminated_length": 58.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 996.109375, + "completions/mean_terminated_length": 973.0139770507812, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, "epoch": 0.7718699325765981, - "grad_norm": 2.7655861377716064, - "kl": 8.59375, - "learning_rate": 2.3551941571851534e-07, - "loss": 0.5227, - "num_tokens": 1221145738.0, - "reward": 1.78662109375, - "reward_std": 0.5410267114639282, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.20045962929725647, + "grad_norm": 3.0474798679351807, + "kl": 3.404296875, + "learning_rate": 2.356167506173644e-07, + "loss": 0.1718, + "num_tokens": 1275086310.0, + "reward": 1.0390625, + "reward_std": 0.3389974534511566, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.17416280508041382, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.2045886218547821, "step": 2261 }, { @@ -65584,27 +65584,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1934.0, - "completions/mean_length": 767.58984375, - "completions/mean_terminated_length": 728.9456787109375, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1027.71484375, + "completions/mean_terminated_length": 994.8023681640625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, "epoch": 0.7722113168899889, - "grad_norm": 1.3708407878875732, - "kl": 6.8515625, - "learning_rate": 2.3513617574681828e-07, - "loss": 0.4571, - "num_tokens": 1221610312.0, - "reward": 1.814453125, - "reward_std": 0.558542013168335, - "rewards/accuracy_reward/mean": 0.04838709533214569, - "rewards/accuracy_reward/std": 0.21479946374893188, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.20538106560707092, + "grad_norm": 2.281660556793213, + "kl": 2.142578125, + "learning_rate": 2.3523325196874746e-07, + "loss": 0.0752, + "num_tokens": 1275684068.0, + "reward": 1.02783203125, + "reward_std": 0.29623085260391235, + "rewards/accuracy_reward/mean": 0.07459677755832672, + "rewards/accuracy_reward/std": 0.263004869222641, + "rewards/format_reward/mean": 0.02734375, + "rewards/format_reward/std": 0.16324250400066376, + "rewards/tag_count_reward/mean": 0.92822265625, + "rewards/tag_count_reward/std": 0.18131764233112335, "step": 2262 }, { @@ -65613,27 +65613,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1919.0, - "completions/mean_length": 822.24609375, - "completions/mean_terminated_length": 772.4186401367188, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1111.408203125, + "completions/mean_terminated_length": 1092.7509765625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.7725527012033797, - "grad_norm": 1.4335846900939941, - "kl": 6.8203125, - "learning_rate": 2.347533826667302e-07, - "loss": 0.4465, - "num_tokens": 1222102950.0, - "reward": 1.7841796875, - "reward_std": 0.5437231659889221, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.9111328125, - "rewards/tag_count_reward/std": 0.20610326528549194, + "grad_norm": 4.6520209312438965, + "kl": 2.302734375, + "learning_rate": 2.3485020041298544e-07, + "loss": 0.0914, + "num_tokens": 1276324757.0, + "reward": 1.04345703125, + "reward_std": 0.3242158889770508, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.93408203125, + "rewards/tag_count_reward/std": 0.172541543841362, "step": 2263 }, { @@ -65642,27 +65642,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1979.0, - "completions/mean_length": 780.509765625, - "completions/mean_terminated_length": 734.325927734375, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1039.037109375, + "completions/mean_terminated_length": 1014.822021484375, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, "epoch": 0.7728940855167705, - "grad_norm": 1.5141960382461548, - "kl": 6.23828125, - "learning_rate": 2.3437103702155617e-07, - "loss": 0.3974, - "num_tokens": 1222576635.0, - "reward": 1.82568359375, - "reward_std": 0.5265644788742065, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.19901487231254578, + "grad_norm": 3.5061988830566406, + "kl": 2.669921875, + "learning_rate": 2.344675964941627e-07, + "loss": 0.1306, + "num_tokens": 1276930808.0, + "reward": 1.05224609375, + "reward_std": 0.30761483311653137, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.16789913177490234, "step": 2264 }, { @@ -65671,27 +65671,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 716.271484375, - "completions/mean_terminated_length": 681.5771484375, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 966.9765625, + "completions/mean_terminated_length": 943.2415161132812, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.7732354698301613, - "grad_norm": 1.5562620162963867, - "kl": 5.4453125, - "learning_rate": 2.339891393539656e-07, - "loss": 0.3614, - "num_tokens": 1223013670.0, - "reward": 1.91015625, - "reward_std": 0.5030232667922974, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.18489299714565277, + "grad_norm": 4.673521041870117, + "kl": 3.830078125, + "learning_rate": 2.3408544075572727e-07, + "loss": 0.2032, + "num_tokens": 1277496204.0, + "reward": 1.11767578125, + "reward_std": 0.33358681201934814, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.92822265625, + "rewards/tag_count_reward/std": 0.18266178667545319, "step": 2265 }, { @@ -65702,25 +65702,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1862.0, - "completions/mean_length": 770.376953125, - "completions/mean_terminated_length": 744.9263305664062, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1007.16796875, + "completions/mean_terminated_length": 986.4342651367188, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.7735768541435522, - "grad_norm": 1.3630690574645996, - "kl": 4.7421875, - "learning_rate": 2.336076902059927e-07, - "loss": 0.3269, - "num_tokens": 1223480823.0, - "reward": 1.94140625, - "reward_std": 0.4669956564903259, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.900390625, - "rewards/format_reward/std": 0.29977133870124817, - "rewards/tag_count_reward/mean": 0.951171875, - "rewards/tag_count_reward/std": 0.15736515820026398, + "grad_norm": 1.8258031606674194, + "kl": 2.138671875, + "learning_rate": 2.337037337404913e-07, + "loss": 0.0864, + "num_tokens": 1278084594.0, + "reward": 1.11962890625, + "reward_std": 0.3642021715641022, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.17460595071315765, "step": 2266 }, { @@ -65729,27 +65729,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1978.0, - "completions/mean_length": 767.064453125, - "completions/mean_terminated_length": 736.322021484375, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 986.375, + "completions/mean_terminated_length": 963.0658569335938, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, "epoch": 0.7739182384569429, - "grad_norm": 2.2570767402648926, - "kl": 5.08203125, - "learning_rate": 2.3322669011903461e-07, - "loss": 0.3812, - "num_tokens": 1223951048.0, - "reward": 1.92822265625, - "reward_std": 0.5108921527862549, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.16601619124412537, + "grad_norm": 5.637394428253174, + "kl": 2.703125, + "learning_rate": 2.333224759906288e-07, + "loss": 0.1097, + "num_tokens": 1278667106.0, + "reward": 1.091796875, + "reward_std": 0.3354136347770691, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.1781550794839859, "step": 2267 }, { @@ -65758,27 +65758,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1953.0, - "completions/mean_length": 735.439453125, - "completions/mean_terminated_length": 717.2455444335938, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1017.703125, + "completions/mean_terminated_length": 984.4677124023438, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.7742596227703337, - "grad_norm": 1.4364755153656006, - "kl": 4.8203125, - "learning_rate": 2.3284613963385113e-07, - "loss": 0.3042, - "num_tokens": 1224403225.0, - "reward": 1.921875, - "reward_std": 0.5390156507492065, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310423493385315, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.18369008600711823, + "grad_norm": 3.8480064868927, + "kl": 2.791015625, + "learning_rate": 2.3294166804767634e-07, + "loss": 0.1872, + "num_tokens": 1279263802.0, + "reward": 1.13720703125, + "reward_std": 0.31603750586509705, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.1834864467382431, "step": 2268 }, { @@ -65787,27 +65787,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1796.0, - "completions/mean_length": 717.837890625, - "completions/mean_terminated_length": 677.692138671875, - "completions/min_length": 77.0, - "completions/min_terminated_length": 77.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 986.7890625, + "completions/mean_terminated_length": 943.650390625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, "epoch": 0.7746010070837245, - "grad_norm": 1.0798249244689941, - "kl": 5.5625, - "learning_rate": 2.3246603929056435e-07, - "loss": 0.3721, - "num_tokens": 1224847286.0, - "reward": 1.9267578125, - "reward_std": 0.49401456117630005, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.18406164646148682, + "grad_norm": 2.9448046684265137, + "kl": 2.755859375, + "learning_rate": 2.3256131045253127e-07, + "loss": 0.145, + "num_tokens": 1279845566.0, + "reward": 1.08984375, + "reward_std": 0.34643620252609253, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.18962833285331726, "step": 2269 }, { @@ -65816,27 +65816,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 749.353515625, - "completions/mean_terminated_length": 712.8453369140625, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1005.92578125, + "completions/mean_terminated_length": 983.0458984375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, "epoch": 0.7749423913971153, - "grad_norm": 3.0914320945739746, - "kl": 4.671875, - "learning_rate": 2.320863896286569e-07, - "loss": 0.3542, - "num_tokens": 1225304939.0, - "reward": 1.97705078125, - "reward_std": 0.5090059638023376, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, - "rewards/format_reward/mean": 0.90625, - "rewards/format_reward/std": 0.29176566004753113, - "rewards/tag_count_reward/mean": 0.94775390625, - "rewards/tag_count_reward/std": 0.17155851423740387, + "grad_norm": 2.3940649032592773, + "kl": 2.41015625, + "learning_rate": 2.3218140374545137e-07, + "loss": 0.1288, + "num_tokens": 1280434584.0, + "reward": 1.14599609375, + "reward_std": 0.3611350655555725, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.17291219532489777, "step": 2270 }, { @@ -65845,27 +65845,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1989.0, - "completions/mean_length": 797.6875, - "completions/mean_terminated_length": 770.2355346679688, - "completions/min_length": 84.0, - "completions/min_terminated_length": 84.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1066.4453125, + "completions/mean_terminated_length": 1042.8880615234375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, "epoch": 0.7752837757105061, - "grad_norm": 2.688586711883545, - "kl": 5.3984375, - "learning_rate": 2.3170719118697228e-07, - "loss": 0.3739, - "num_tokens": 1225789195.0, - "reward": 1.8974609375, - "reward_std": 0.4912329316139221, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.18272781372070312, + "grad_norm": 4.552471160888672, + "kl": 2.107421875, + "learning_rate": 2.3180194846605364e-07, + "loss": 0.133, + "num_tokens": 1281056444.0, + "reward": 1.1162109375, + "reward_std": 0.37534934282302856, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.19635941088199615, "step": 2271 }, { @@ -65874,27 +65874,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1780.0, - "completions/mean_length": 726.333984375, - "completions/mean_terminated_length": 697.3153686523438, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 958.115234375, + "completions/mean_terminated_length": 918.40283203125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, "epoch": 0.7756251600238969, - "grad_norm": 1.342585563659668, - "kl": 4.94140625, - "learning_rate": 2.3132844450371314e-07, - "loss": 0.3113, - "num_tokens": 1226235382.0, - "reward": 1.884765625, - "reward_std": 0.4904659390449524, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.1712302714586258, + "grad_norm": 2.0496935844421387, + "kl": 2.673828125, + "learning_rate": 2.3142294515331437e-07, + "loss": 0.1358, + "num_tokens": 1281621303.0, + "reward": 1.00634765625, + "reward_std": 0.24395737051963806, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.1385180652141571, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.17875948548316956, "step": 2272 }, { @@ -65903,27 +65903,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 825.859375, - "completions/mean_terminated_length": 763.1211547851562, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1069.80078125, + "completions/mean_terminated_length": 1034.157958984375, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, "epoch": 0.7759665443372877, - "grad_norm": 2.138890027999878, - "kl": 7.7265625, - "learning_rate": 2.3095015011644128e-07, - "loss": 0.4599, - "num_tokens": 1226730526.0, - "reward": 1.822265625, - "reward_std": 0.5565134286880493, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.20674438774585724, + "grad_norm": 3.2112843990325928, + "kl": 2.18359375, + "learning_rate": 2.3104439434556775e-07, + "loss": 0.1065, + "num_tokens": 1282241345.0, + "reward": 1.07080078125, + "reward_std": 0.3248564600944519, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.18481481075286865, "step": 2273 }, { @@ -65932,27 +65932,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1933.0, - "completions/mean_length": 822.09765625, - "completions/mean_terminated_length": 785.0985717773438, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1075.50390625, + "completions/mean_terminated_length": 1031.8408203125, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, "epoch": 0.7763079286506785, - "grad_norm": 0.9268271923065186, - "kl": 5.9375, - "learning_rate": 2.3057230856207633e-07, - "loss": 0.3569, - "num_tokens": 1227230800.0, - "reward": 1.85595703125, - "reward_std": 0.5136551856994629, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.18939071893692017, + "grad_norm": 2.3921453952789307, + "kl": 2.779296875, + "learning_rate": 2.3066629658050482e-07, + "loss": 0.1625, + "num_tokens": 1282871363.0, + "reward": 1.060546875, + "reward_std": 0.32115721702575684, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.19534705579280853, "step": 2274 }, { @@ -65961,27 +65961,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 863.791015625, - "completions/mean_terminated_length": 815.6524047851562, - "completions/min_length": 101.0, - "completions/min_terminated_length": 101.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1098.357421875, + "completions/mean_terminated_length": 1053.691162109375, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, "epoch": 0.7766493129640692, - "grad_norm": 1.4397482872009277, - "kl": 6.6484375, - "learning_rate": 2.3019492037689518e-07, - "loss": 0.3913, - "num_tokens": 1227747429.0, - "reward": 1.82470703125, - "reward_std": 0.5017789602279663, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.1974627822637558, + "grad_norm": 2.4226014614105225, + "kl": 2.333984375, + "learning_rate": 2.3028865239517363e-07, + "loss": 0.1366, + "num_tokens": 1283508090.0, + "reward": 1.02490234375, + "reward_std": 0.28532925248146057, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.92529296875, + "rewards/tag_count_reward/std": 0.18012800812721252, "step": 2275 }, { @@ -65992,25 +65992,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1911.0, - "completions/mean_length": 770.060546875, - "completions/mean_terminated_length": 726.1717529296875, - "completions/min_length": 96.0, - "completions/min_terminated_length": 96.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 998.8984375, + "completions/mean_terminated_length": 962.8687133789062, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.7769906972774601, - "grad_norm": 2.095837116241455, - "kl": 7.59375, - "learning_rate": 2.2981798609653148e-07, - "loss": 0.4539, - "num_tokens": 1228215572.0, - "reward": 1.8271484375, - "reward_std": 0.542022705078125, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.18964596092700958, + "grad_norm": 3.0213632583618164, + "kl": 2.630859375, + "learning_rate": 2.299114623259778e-07, + "loss": 0.1217, + "num_tokens": 1284093398.0, + "reward": 1.09912109375, + "reward_std": 0.30310487747192383, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.92919921875, + "rewards/tag_count_reward/std": 0.19089330732822418, "step": 2276 }, { @@ -66019,27 +66019,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1966.0, - "completions/mean_length": 818.544921875, - "completions/mean_terminated_length": 786.5150146484375, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1072.08984375, + "completions/mean_terminated_length": 1044.654541015625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.7773320815908509, - "grad_norm": 0.8329452872276306, - "kl": 6.234375, - "learning_rate": 2.294415062559743e-07, - "loss": 0.3796, - "num_tokens": 1228705499.0, - "reward": 1.9345703125, - "reward_std": 0.5677171349525452, - "rewards/accuracy_reward/mean": 0.1391129046678543, - "rewards/accuracy_reward/std": 0.3464137017726898, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.18399934470653534, + "grad_norm": 3.269261360168457, + "kl": 2.232421875, + "learning_rate": 2.29534726908676e-07, + "loss": 0.095, + "num_tokens": 1284713140.0, + "reward": 1.1396484375, + "reward_std": 0.3556555509567261, + "rewards/accuracy_reward/mean": 0.15927419066429138, + "rewards/accuracy_reward/std": 0.3663010001182556, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.17448893189430237, "step": 2277 }, { @@ -66048,27 +66048,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 820.166015625, - "completions/mean_terminated_length": 777.9979858398438, - "completions/min_length": 41.0, - "completions/min_terminated_length": 41.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1029.0546875, + "completions/mean_terminated_length": 998.3017578125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, "epoch": 0.7776734659042417, - "grad_norm": 2.3972365856170654, - "kl": 7.7578125, - "learning_rate": 2.2906548138956815e-07, - "loss": 0.4528, - "num_tokens": 1229203488.0, - "reward": 1.810546875, - "reward_std": 0.5649162530899048, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.20733514428138733, + "grad_norm": 2.3558075428009033, + "kl": 1.802734375, + "learning_rate": 2.2915844667838074e-07, + "loss": 0.0651, + "num_tokens": 1285318080.0, + "reward": 0.99853515625, + "reward_std": 0.29774922132492065, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.17416280508041382, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.18053071200847626, "step": 2278 }, { @@ -66077,27 +66077,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 824.634765625, - "completions/mean_terminated_length": 785.1713256835938, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1097.9921875, + "completions/mean_terminated_length": 1057.360595703125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.7780148502176325, - "grad_norm": 1.5216360092163086, - "kl": 6.890625, - "learning_rate": 2.2868991203101145e-07, - "loss": 0.4244, - "num_tokens": 1229703557.0, - "reward": 1.8779296875, - "reward_std": 0.5143810510635376, - "rewards/accuracy_reward/mean": 0.07500000298023224, - "rewards/accuracy_reward/std": 0.26366615295410156, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.17660093307495117, + "grad_norm": 2.3282322883605957, + "kl": 2.58984375, + "learning_rate": 2.2878262216955863e-07, + "loss": 0.1496, + "num_tokens": 1285958108.0, + "reward": 1.0380859375, + "reward_std": 0.3391152620315552, + "rewards/accuracy_reward/mean": 0.08541666716337204, + "rewards/accuracy_reward/std": 0.27979233860969543, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.19561834633350372, "step": 2279 }, { @@ -66106,27 +66106,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1906.0, - "completions/mean_length": 782.357421875, - "completions/mean_terminated_length": 736.2409057617188, - "completions/min_length": 46.0, - "completions/min_terminated_length": 46.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1044.720703125, + "completions/mean_terminated_length": 1016.5160522460938, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.7783562345310233, - "grad_norm": 2.0146689414978027, - "kl": 7.171875, - "learning_rate": 2.283147987133561e-07, - "loss": 0.4427, - "num_tokens": 1230185004.0, - "reward": 1.8876953125, - "reward_std": 0.5053796172142029, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.18399934470653534, + "grad_norm": 2.625993013381958, + "kl": 3.01171875, + "learning_rate": 2.2840725391602826e-07, + "loss": 0.1692, + "num_tokens": 1286573885.0, + "reward": 1.06298828125, + "reward_std": 0.32963770627975464, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.91845703125, + "rewards/tag_count_reward/std": 0.19362683594226837, "step": 2280 }, { @@ -66135,27 +66135,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 847.841796875, - "completions/mean_terminated_length": 788.8175659179688, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1108.6484375, + "completions/mean_terminated_length": 1064.4661865234375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.7786976188444141, - "grad_norm": 1.562910795211792, - "kl": 7.0859375, - "learning_rate": 2.2794014196900704e-07, - "loss": 0.4151, - "num_tokens": 1230690843.0, - "reward": 1.85498046875, - "reward_std": 0.5231390595436096, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.20506852865219116, + "grad_norm": 5.202631950378418, + "kl": 2.6171875, + "learning_rate": 2.2803234245096062e-07, + "loss": 0.1557, + "num_tokens": 1287213257.0, + "reward": 1.08203125, + "reward_std": 0.3195320963859558, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.919921875, + "rewards/tag_count_reward/std": 0.19642995297908783, "step": 2281 }, { @@ -66164,27 +66164,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, - "completions/mean_length": 853.474609375, - "completions/mean_terminated_length": 814.9415283203125, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/mean_length": 1098.8203125, + "completions/mean_terminated_length": 1085.6634521484375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, "epoch": 0.7790390031578049, - "grad_norm": 1.12908136844635, - "kl": 5.359375, - "learning_rate": 2.275659423297208e-07, - "loss": 0.3564, - "num_tokens": 1231210094.0, - "reward": 1.9248046875, - "reward_std": 0.4759534001350403, + "grad_norm": 5.33516263961792, + "kl": 2.080078125, + "learning_rate": 2.2765788830687782e-07, + "loss": 0.0554, + "num_tokens": 1287858125.0, + "reward": 1.0859375, + "reward_std": 0.3283921778202057, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.9443359375, - "rewards/tag_count_reward/std": 0.17011895775794983, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.1787441074848175, "step": 2282 }, { @@ -66193,27 +66193,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1982.0, - "completions/mean_length": 826.4921875, - "completions/mean_terminated_length": 794.6693725585938, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1059.697265625, + "completions/mean_terminated_length": 1011.0921630859375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, "epoch": 0.7793803874711956, - "grad_norm": 1.2648156881332397, - "kl": 5.3984375, - "learning_rate": 2.2719220032660553e-07, - "loss": 0.3422, - "num_tokens": 1231711818.0, - "reward": 1.8525390625, - "reward_std": 0.49191102385520935, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.18538589775562286, + "grad_norm": 3.056795358657837, + "kl": 2.595703125, + "learning_rate": 2.2728389201565252e-07, + "loss": 0.1649, + "num_tokens": 1288479250.0, + "reward": 1.013671875, + "reward_std": 0.33103084564208984, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.19042283296585083, "step": 2283 }, { @@ -66222,27 +66222,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1940.0, - "completions/mean_length": 771.978515625, - "completions/mean_terminated_length": 728.1555786132812, - "completions/min_length": 47.0, - "completions/min_terminated_length": 47.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 979.302734375, + "completions/mean_terminated_length": 951.4609375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.7797217717845865, - "grad_norm": 1.8786842823028564, - "kl": 5.96875, - "learning_rate": 2.2681891649011942e-07, - "loss": 0.3983, - "num_tokens": 1232181631.0, - "reward": 1.87109375, - "reward_std": 0.5296258330345154, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.18633443117141724, + "grad_norm": 1.7095515727996826, + "kl": 2.18359375, + "learning_rate": 2.269103541085065e-07, + "loss": 0.1105, + "num_tokens": 1289055213.0, + "reward": 1.08837890625, + "reward_std": 0.3720618188381195, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.17738577723503113, "step": 2284 }, { @@ -66251,27 +66251,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1772.0, - "completions/mean_length": 828.51171875, - "completions/mean_terminated_length": 794.2288818359375, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1081.056640625, + "completions/mean_terminated_length": 1047.8485107421875, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, "epoch": 0.7800631560979773, - "grad_norm": 1.1072237491607666, - "kl": 4.97265625, - "learning_rate": 2.2644609135007088e-07, - "loss": 0.2721, - "num_tokens": 1232684645.0, - "reward": 1.87158203125, - "reward_std": 0.5229383707046509, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.18486134707927704, + "grad_norm": 3.155726432800293, + "kl": 2.6796875, + "learning_rate": 2.2653727511601115e-07, + "loss": 0.1181, + "num_tokens": 1289687530.0, + "reward": 1.06787109375, + "reward_std": 0.3291609287261963, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.16568778455257416, "step": 2285 }, { @@ -66280,27 +66280,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1884.0, - "completions/mean_length": 882.921875, - "completions/mean_terminated_length": 835.5609741210938, - "completions/min_length": 186.0, - "completions/min_terminated_length": 186.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1161.73046875, + "completions/mean_terminated_length": 1110.4586181640625, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, "epoch": 0.7804045404113681, - "grad_norm": 0.6721342206001282, - "kl": 4.5859375, - "learning_rate": 2.2607372543561681e-07, - "loss": 0.2602, - "num_tokens": 1233210765.0, - "reward": 1.8681640625, - "reward_std": 0.44373592734336853, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.9345703125, - "rewards/tag_count_reward/std": 0.18863557279109955, + "grad_norm": 1.8491008281707764, + "kl": 2.83203125, + "learning_rate": 2.261646555680855e-07, + "loss": 0.1321, + "num_tokens": 1290356400.0, + "reward": 1.01513671875, + "reward_std": 0.3437083959579468, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.90966796875, + "rewards/tag_count_reward/std": 0.19850045442581177, "step": 2286 }, { @@ -66309,27 +66309,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1955.0, - "completions/mean_length": 804.482421875, - "completions/mean_terminated_length": 772.086181640625, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1022.39453125, + "completions/mean_terminated_length": 989.3104248046875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, "epoch": 0.7807459247247589, - "grad_norm": 2.7682130336761475, - "kl": 5.578125, - "learning_rate": 2.257018192752625e-07, - "loss": 0.3786, - "num_tokens": 1233696836.0, - "reward": 1.86962890625, - "reward_std": 0.48753321170806885, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.93994140625, - "rewards/tag_count_reward/std": 0.18553723394870758, + "grad_norm": 1.5820467472076416, + "kl": 2.607421875, + "learning_rate": 2.2579249599399616e-07, + "loss": 0.1441, + "num_tokens": 1290954042.0, + "reward": 1.0400390625, + "reward_std": 0.3252141773700714, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.1848077028989792, "step": 2287 }, { @@ -66338,27 +66338,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1892.0, - "completions/mean_length": 881.515625, - "completions/mean_terminated_length": 831.6253051757812, - "completions/min_length": 96.0, - "completions/min_terminated_length": 96.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1030.3984375, + "completions/mean_terminated_length": 1010.1275024414062, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.7810873090381497, - "grad_norm": 1.2741656303405762, - "kl": 6.0234375, - "learning_rate": 2.2533037339686085e-07, - "loss": 0.3644, - "num_tokens": 1234221676.0, - "reward": 1.81201171875, - "reward_std": 0.5945438146591187, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.19697342813014984, + "grad_norm": 1.740046501159668, + "kl": 1.87890625, + "learning_rate": 2.254207969223566e-07, + "loss": 0.0714, + "num_tokens": 1291555110.0, + "reward": 1.076171875, + "reward_std": 0.319326788187027, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.1726529598236084, "step": 2288 }, { @@ -66367,27 +66367,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 837.576171875, - "completions/mean_terminated_length": 798.5302124023438, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1081.099609375, + "completions/mean_terminated_length": 1039.7454833984375, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, "epoch": 0.7814286933515405, - "grad_norm": 1.164945125579834, - "kl": 5.24609375, - "learning_rate": 2.2495938832761114e-07, - "loss": 0.3163, - "num_tokens": 1234734179.0, - "reward": 1.8564453125, - "reward_std": 0.47739219665527344, - "rewards/accuracy_reward/mean": 0.05443548411130905, - "rewards/accuracy_reward/std": 0.227104052901268, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.1765792965888977, + "grad_norm": 1.753785252571106, + "kl": 2.3359375, + "learning_rate": 2.25049558881126e-07, + "loss": 0.1101, + "num_tokens": 1292192297.0, + "reward": 1.05322265625, + "reward_std": 0.3315582275390625, + "rewards/accuracy_reward/mean": 0.07056451588869095, + "rewards/accuracy_reward/std": 0.25635457038879395, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.18481481075286865, "step": 2289 }, { @@ -66396,27 +66396,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1971.0, - "completions/mean_length": 856.666015625, - "completions/mean_terminated_length": 808.23779296875, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1107.896484375, + "completions/mean_terminated_length": 1047.3077392578125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.7817700776649313, - "grad_norm": 1.6337648630142212, - "kl": 6.9453125, - "learning_rate": 2.245888645940591e-07, - "loss": 0.3918, - "num_tokens": 1235254088.0, - "reward": 1.81005859375, - "reward_std": 0.5663132667541504, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.20802472531795502, + "grad_norm": 2.1008362770080566, + "kl": 2.076171875, + "learning_rate": 2.2467878239760851e-07, + "loss": 0.1088, + "num_tokens": 1292840836.0, + "reward": 1.041015625, + "reward_std": 0.35410812497138977, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.1937359869480133, "step": 2290 }, { @@ -66425,27 +66425,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 841.869140625, - "completions/mean_terminated_length": 797.9210815429688, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1026.337890625, + "completions/mean_terminated_length": 993.3810424804688, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, "epoch": 0.782111461978322, - "grad_norm": 1.6514548063278198, - "kl": 7.3046875, - "learning_rate": 2.2421880272209524e-07, - "loss": 0.4617, - "num_tokens": 1235755253.0, - "reward": 1.92236328125, - "reward_std": 0.5728150010108948, - "rewards/accuracy_reward/mean": 0.146484375, - "rewards/accuracy_reward/std": 0.35393697023391724, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.19486647844314575, + "grad_norm": 2.446251392364502, + "kl": 2.388671875, + "learning_rate": 2.243084679984531e-07, + "loss": 0.1456, + "num_tokens": 1293436449.0, + "reward": 1.14697265625, + "reward_std": 0.35229337215423584, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39069411158561707, + "rewards/format_reward/mean": 0.029296875, + "rewards/format_reward/std": 0.16880230605602264, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.17731572687625885, "step": 2291 }, { @@ -66454,27 +66454,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1950.0, - "completions/mean_length": 823.353515625, - "completions/mean_terminated_length": 768.369384765625, - "completions/min_length": 77.0, - "completions/min_terminated_length": 77.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1044.359375, + "completions/mean_terminated_length": 994.9999389648438, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, "epoch": 0.7824528462917129, - "grad_norm": 1.2169692516326904, - "kl": 8.0, - "learning_rate": 2.238492032369546e-07, - "loss": 0.4705, - "num_tokens": 1236258218.0, - "reward": 1.7724609375, - "reward_std": 0.6060072183609009, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.8046875, - "rewards/format_reward/std": 0.3968288004398346, - "rewards/tag_count_reward/mean": 0.9072265625, - "rewards/tag_count_reward/std": 0.21200865507125854, + "grad_norm": 3.2603793144226074, + "kl": 2.16796875, + "learning_rate": 2.2393861620965205e-07, + "loss": 0.0739, + "num_tokens": 1294052569.0, + "reward": 1.0244140625, + "reward_std": 0.32584571838378906, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.1872730702161789, "step": 2292 }, { @@ -66483,27 +66483,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 793.775390625, - "completions/mean_terminated_length": 745.4381103515625, - "completions/min_length": 42.0, - "completions/min_terminated_length": 42.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1023.06640625, + "completions/mean_terminated_length": 957.0103759765625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.7827942306051037, - "grad_norm": 0.8538448214530945, - "kl": 6.5078125, - "learning_rate": 2.2348006666321633e-07, - "loss": 0.4075, - "num_tokens": 1236745143.0, - "reward": 1.8251953125, - "reward_std": 0.520323634147644, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.18835169076919556, + "grad_norm": 3.8458852767944336, + "kl": 2.33984375, + "learning_rate": 2.2356922755654068e-07, + "loss": 0.1037, + "num_tokens": 1294656891.0, + "reward": 1.076171875, + "reward_std": 0.3515157103538513, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.2045886218547821, "step": 2293 }, { @@ -66512,27 +66512,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 900.056640625, - "completions/mean_terminated_length": 846.0633544921875, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1038.7109375, + "completions/mean_terminated_length": 1014.488037109375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.7831356149184945, - "grad_norm": 1.750657558441162, - "kl": 7.5, - "learning_rate": 2.2311139352480196e-07, - "loss": 0.4444, - "num_tokens": 1237275332.0, - "reward": 1.8310546875, - "reward_std": 0.5770972371101379, - "rewards/accuracy_reward/mean": 0.10282257944345474, - "rewards/accuracy_reward/std": 0.30403366684913635, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.9111328125, - "rewards/tag_count_reward/std": 0.21021628379821777, + "grad_norm": 2.5841803550720215, + "kl": 2.0625, + "learning_rate": 2.2320030256379656e-07, + "loss": 0.1127, + "num_tokens": 1295258071.0, + "reward": 1.1044921875, + "reward_std": 0.3568401336669922, + "rewards/accuracy_reward/mean": 0.11693548411130905, + "rewards/accuracy_reward/std": 0.3216678202152252, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.9443359375, + "rewards/tag_count_reward/std": 0.1672183722257614, "step": 2294 }, { @@ -66541,27 +66541,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 924.345703125, - "completions/mean_terminated_length": 861.7918090820312, - "completions/min_length": 41.0, - "completions/min_terminated_length": 41.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1065.115234375, + "completions/mean_terminated_length": 1045.535888671875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, "epoch": 0.7834769992318853, - "grad_norm": 0.9420157670974731, - "kl": 7.33203125, - "learning_rate": 2.227431843449759e-07, - "loss": 0.4393, - "num_tokens": 1237825605.0, - "reward": 1.78564453125, - "reward_std": 0.5860618352890015, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.2085477113723755, + "grad_norm": 3.0170798301696777, + "kl": 1.67578125, + "learning_rate": 2.2283184175543867e-07, + "loss": 0.072, + "num_tokens": 1295880418.0, + "reward": 1.02783203125, + "reward_std": 0.27887699007987976, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.2029850035905838, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.94384765625, + "rewards/tag_count_reward/std": 0.15529228746891022, "step": 2295 }, { @@ -66570,27 +66570,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1966.0, - "completions/mean_length": 832.921875, - "completions/mean_terminated_length": 788.6477661132812, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1029.1796875, + "completions/mean_terminated_length": 989.914794921875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.7838183835452761, - "grad_norm": 1.458985447883606, - "kl": 5.6484375, - "learning_rate": 2.2237543964634343e-07, - "loss": 0.3199, - "num_tokens": 1238330461.0, - "reward": 1.85546875, - "reward_std": 0.4921872913837433, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.931640625, - "rewards/tag_count_reward/std": 0.186283141374588, + "grad_norm": 1.7091035842895508, + "kl": 2.244140625, + "learning_rate": 2.2246384565482645e-07, + "loss": 0.1335, + "num_tokens": 1296485758.0, + "reward": 1.06396484375, + "reward_std": 0.31465262174606323, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.94091796875, + "rewards/tag_count_reward/std": 0.17147494852542877, "step": 2296 }, { @@ -66599,27 +66599,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1984.0, - "completions/mean_length": 841.6640625, - "completions/mean_terminated_length": 777.1275634765625, - "completions/min_length": 50.0, - "completions/min_terminated_length": 50.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1051.97265625, + "completions/mean_terminated_length": 1015.68017578125, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, "epoch": 0.7841597678586669, - "grad_norm": 1.8807940483093262, - "kl": 6.3671875, - "learning_rate": 2.2200815995085132e-07, - "loss": 0.4228, - "num_tokens": 1238840401.0, - "reward": 1.84130859375, - "reward_std": 0.6311179995536804, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.90771484375, - "rewards/tag_count_reward/std": 0.21251004934310913, + "grad_norm": 3.524322748184204, + "kl": 2.892578125, + "learning_rate": 2.220963147846595e-07, + "loss": 0.1858, + "num_tokens": 1297103376.0, + "reward": 1.15673828125, + "reward_std": 0.3673393726348877, + "rewards/accuracy_reward/mean": 0.17578125, + "rewards/accuracy_reward/std": 0.3810062110424042, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.92822265625, + "rewards/tag_count_reward/std": 0.18729029595851898, "step": 2297 }, { @@ -66628,27 +66628,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1982.0, - "completions/mean_length": 853.55078125, - "completions/mean_terminated_length": 797.3701171875, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1058.099609375, + "completions/mean_terminated_length": 1028.2232666015625, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, "epoch": 0.7845011521720577, - "grad_norm": 2.3324904441833496, - "kl": 5.7265625, - "learning_rate": 2.2164134577978528e-07, - "loss": 0.3718, - "num_tokens": 1239363915.0, - "reward": 1.8291015625, - "reward_std": 0.5720508694648743, - "rewards/accuracy_reward/mean": 0.06854838877916336, - "rewards/accuracy_reward/std": 0.25293970108032227, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.18964596092700958, + "grad_norm": 4.519505023956299, + "kl": 1.904296875, + "learning_rate": 2.217292496669764e-07, + "loss": 0.0765, + "num_tokens": 1297731619.0, + "reward": 1.1044921875, + "reward_std": 0.3536537289619446, + "rewards/accuracy_reward/mean": 0.10080645233392715, + "rewards/accuracy_reward/std": 0.30137622356414795, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.9462890625, + "rewards/tag_count_reward/std": 0.15964092314243317, "step": 2298 }, { @@ -66657,27 +66657,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, - "completions/mean_length": 917.06640625, - "completions/mean_terminated_length": 868.6965942382812, - "completions/min_length": 184.0, - "completions/min_terminated_length": 184.0, + "completions/mean_length": 1104.576171875, + "completions/mean_terminated_length": 1054.1048583984375, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, "epoch": 0.7848425364854484, - "grad_norm": 2.211033582687378, - "kl": 5.43359375, - "learning_rate": 2.2127499765377133e-07, - "loss": 0.3338, - "num_tokens": 1239910349.0, - "reward": 1.8466796875, - "reward_std": 0.5793843269348145, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.2049875557422638, + "grad_norm": 2.1584033966064453, + "kl": 1.5283203125, + "learning_rate": 2.2136265082315442e-07, + "loss": 0.096, + "num_tokens": 1298374058.0, + "reward": 1.11474609375, + "reward_std": 0.3629153370857239, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.16789913177490234, "step": 2299 }, { @@ -66686,27 +66686,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 935.25, - "completions/mean_terminated_length": 870.8759765625, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1123.92578125, + "completions/mean_terminated_length": 1084.4033203125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.7851839207988393, - "grad_norm": 1.2065584659576416, - "kl": 5.4296875, - "learning_rate": 2.2090911609277375e-07, - "loss": 0.3338, - "num_tokens": 1240469373.0, - "reward": 1.80859375, - "reward_std": 0.5164964199066162, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.20555779337882996, + "grad_norm": 4.45211124420166, + "kl": 2.037109375, + "learning_rate": 2.209965187739084e-07, + "loss": 0.1143, + "num_tokens": 1299029684.0, + "reward": 1.06103515625, + "reward_std": 0.3254391551017761, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.1814756840467453, "step": 2300 }, { @@ -66715,27 +66715,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1999.0, - "completions/mean_length": 872.611328125, - "completions/mean_terminated_length": 819.8387451171875, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1082.166015625, + "completions/mean_terminated_length": 1036.7381591796875, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, "epoch": 0.7855253051122301, - "grad_norm": 2.977849006652832, - "kl": 6.109375, - "learning_rate": 2.2054370161609415e-07, - "loss": 0.4412, - "num_tokens": 1240997430.0, - "reward": 1.853515625, - "reward_std": 0.5015002489089966, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.931640625, - "rewards/tag_count_reward/std": 0.18889120221138, + "grad_norm": 6.068052768707275, + "kl": 2.330078125, + "learning_rate": 2.2063085403929024e-07, + "loss": 0.1714, + "num_tokens": 1299665033.0, + "reward": 1.04931640625, + "reward_std": 0.3271487355232239, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.92236328125, + "rewards/tag_count_reward/std": 0.19334039092063904, "step": 2301 }, { @@ -66744,27 +66744,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 923.1171875, - "completions/mean_terminated_length": 850.6195678710938, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 1063.6171875, + "completions/mean_terminated_length": 1021.5153198242188, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, "epoch": 0.7858666894256209, - "grad_norm": 2.0174193382263184, - "kl": 7.0078125, - "learning_rate": 2.201787547423719e-07, - "loss": 0.461, - "num_tokens": 1241545890.0, - "reward": 1.8154296875, - "reward_std": 0.5970208644866943, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.8046875, - "rewards/format_reward/std": 0.3968288004398346, - "rewards/tag_count_reward/mean": 0.9111328125, - "rewards/tag_count_reward/std": 0.2072867453098297, + "grad_norm": 6.554154872894287, + "kl": 1.5234375, + "learning_rate": 2.2026565713868782e-07, + "loss": 0.1115, + "num_tokens": 1300285429.0, + "reward": 1.14111328125, + "reward_std": 0.3114096522331238, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.93994140625, + "rewards/tag_count_reward/std": 0.1638319194316864, "step": 2302 }, { @@ -66773,27 +66773,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 898.25390625, - "completions/mean_terminated_length": 858.7677001953125, - "completions/min_length": 90.0, - "completions/min_terminated_length": 90.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1112.982421875, + "completions/mean_terminated_length": 1072.991943359375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.7862080737390117, - "grad_norm": 0.9348337054252625, - "kl": 5.7734375, - "learning_rate": 2.19814275989582e-07, - "loss": 0.3434, - "num_tokens": 1242082756.0, - "reward": 1.845703125, - "reward_std": 0.5673692226409912, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.20332364737987518, + "grad_norm": 1.6721909046173096, + "kl": 2.068359375, + "learning_rate": 2.1990092859082492e-07, + "loss": 0.1227, + "num_tokens": 1300932236.0, + "reward": 1.07666015625, + "reward_std": 0.3702687621116638, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.18705546855926514, "step": 2303 }, { @@ -66802,27 +66802,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 868.435546875, - "completions/mean_terminated_length": 825.4555053710938, - "completions/min_length": 114.0, - "completions/min_terminated_length": 114.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1032.6953125, + "completions/mean_terminated_length": 993.56591796875, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, "epoch": 0.7865494580524025, - "grad_norm": 1.2880990505218506, - "kl": 6.515625, - "learning_rate": 2.1945026587503578e-07, - "loss": 0.3942, - "num_tokens": 1242602003.0, - "reward": 1.8369140625, - "reward_std": 0.5780290365219116, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.20294499397277832, + "grad_norm": 4.0513410568237305, + "kl": 2.310546875, + "learning_rate": 2.1953666891375966e-07, + "loss": 0.1194, + "num_tokens": 1301535584.0, + "reward": 1.11865234375, + "reward_std": 0.38107720017433167, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.91943359375, + "rewards/tag_count_reward/std": 0.188277468085289, "step": 2304 }, { @@ -66833,25 +66833,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 768.921875, - "completions/mean_terminated_length": 727.6612548828125, - "completions/min_length": 114.0, - "completions/min_terminated_length": 114.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 969.80078125, + "completions/mean_terminated_length": 935.0201416015625, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.7868908423657933, - "grad_norm": 1.5181031227111816, - "kl": 7.3359375, - "learning_rate": 2.1908672491537854e-07, - "loss": 0.4568, - "num_tokens": 1243069867.0, - "reward": 1.87353515625, - "reward_std": 0.5800575613975525, - "rewards/accuracy_reward/mean": 0.1088709682226181, - "rewards/accuracy_reward/std": 0.31179171800613403, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.19180715084075928, + "grad_norm": 5.8314528465271, + "kl": 2.83203125, + "learning_rate": 2.1917287862488438e-07, + "loss": 0.1991, + "num_tokens": 1302106298.0, + "reward": 1.08984375, + "reward_std": 0.3768289089202881, + "rewards/accuracy_reward/mean": 0.1270161271095276, + "rewards/accuracy_reward/std": 0.3333272337913513, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.919921875, + "rewards/tag_count_reward/std": 0.18945692479610443, "step": 2305 }, { @@ -66860,27 +66860,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 797.25390625, - "completions/mean_terminated_length": 772.3386840820312, - "completions/min_length": 15.0, - "completions/min_terminated_length": 15.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1009.349609375, + "completions/mean_terminated_length": 980.1505737304688, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, "epoch": 0.7872322266791841, - "grad_norm": 1.515105128288269, - "kl": 6.1640625, - "learning_rate": 2.187236536265904e-07, - "loss": 0.3612, - "num_tokens": 1243555197.0, - "reward": 1.7978515625, - "reward_std": 0.5367641448974609, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.1922481507062912, + "grad_norm": 2.5602266788482666, + "kl": 1.931640625, + "learning_rate": 2.1880955824092473e-07, + "loss": 0.1012, + "num_tokens": 1302700221.0, + "reward": 1.09521484375, + "reward_std": 0.34862062335014343, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.17007049918174744, "step": 2306 }, { @@ -66889,27 +66889,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 821.015625, - "completions/mean_terminated_length": 763.3046875, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1046.978515625, + "completions/mean_terminated_length": 1022.9540405273438, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, "epoch": 0.787573610992575, - "grad_norm": 1.7140458822250366, - "kl": 7.6171875, - "learning_rate": 2.1836105252398483e-07, - "loss": 0.4465, - "num_tokens": 1244053477.0, - "reward": 1.83837890625, - "reward_std": 0.6304191946983337, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, - "rewards/format_reward/mean": 0.814453125, - "rewards/format_reward/std": 0.38912075757980347, - "rewards/tag_count_reward/mean": 0.90869140625, - "rewards/tag_count_reward/std": 0.2117803692817688, + "grad_norm": 7.343836784362793, + "kl": 1.974609375, + "learning_rate": 2.1844670827793898e-07, + "loss": 0.075, + "num_tokens": 1303314194.0, + "reward": 1.201171875, + "reward_std": 0.3874884843826294, + "rewards/accuracy_reward/mean": 0.173828125, + "rewards/accuracy_reward/std": 0.3793322443962097, + "rewards/format_reward/mean": 0.080078125, + "rewards/format_reward/std": 0.271679550409317, + "rewards/tag_count_reward/mean": 0.947265625, + "rewards/tag_count_reward/std": 0.1537284255027771, "step": 2307 }, { @@ -66918,27 +66918,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1978.0, - "completions/mean_length": 844.384765625, - "completions/mean_terminated_length": 820.4083862304688, - "completions/min_length": 29.0, - "completions/min_terminated_length": 29.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1106.8828125, + "completions/mean_terminated_length": 1078.4788818359375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.7879149953059656, - "grad_norm": 1.7330269813537598, - "kl": 4.1875, - "learning_rate": 2.1799892212220745e-07, - "loss": 0.2406, - "num_tokens": 1244564410.0, - "reward": 1.92919921875, - "reward_std": 0.5259957313537598, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.94287109375, - "rewards/tag_count_reward/std": 0.16708904504776, + "grad_norm": 2.5623719692230225, + "kl": 2.388671875, + "learning_rate": 2.1808432925131686e-07, + "loss": 0.1297, + "num_tokens": 1303959526.0, + "reward": 1.08837890625, + "reward_std": 0.34951961040496826, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.92822265625, + "rewards/tag_count_reward/std": 0.18266178667545319, "step": 2308 }, { @@ -66947,27 +66947,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1934.0, - "completions/mean_length": 832.71875, - "completions/mean_terminated_length": 778.1550903320312, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1052.9609375, + "completions/mean_terminated_length": 1020.8628540039062, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, "epoch": 0.7882563796193565, - "grad_norm": 2.6183900833129883, - "kl": 6.5234375, - "learning_rate": 2.1763726293523642e-07, - "loss": 0.437, - "num_tokens": 1245066602.0, - "reward": 1.8818359375, - "reward_std": 0.5811901092529297, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.19773660600185394, + "grad_norm": 2.1872973442077637, + "kl": 2.3203125, + "learning_rate": 2.1772242167577973e-07, + "loss": 0.1176, + "num_tokens": 1304574482.0, + "reward": 1.13037109375, + "reward_std": 0.36306798458099365, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.17220339179039001, "step": 2309 }, { @@ -66976,27 +66976,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1974.0, - "completions/mean_length": 810.8046875, - "completions/mean_terminated_length": 778.5731811523438, - "completions/min_length": 174.0, - "completions/min_terminated_length": 174.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1013.720703125, + "completions/mean_terminated_length": 958.3888549804688, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.7885977639327473, - "grad_norm": 2.6835498809814453, - "kl": 5.421875, - "learning_rate": 2.1727607547638073e-07, - "loss": 0.324, - "num_tokens": 1245556326.0, - "reward": 1.89013671875, - "reward_std": 0.5729560852050781, - "rewards/accuracy_reward/mean": 0.12109375, - "rewards/accuracy_reward/std": 0.3265552520751953, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.18625685572624207, + "grad_norm": 5.431679725646973, + "kl": 2.291015625, + "learning_rate": 2.1736098606537867e-07, + "loss": 0.0968, + "num_tokens": 1305168099.0, + "reward": 1.1376953125, + "reward_std": 0.3144841194152832, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.9443359375, + "rewards/tag_count_reward/std": 0.16352032124996185, "step": 2310 }, { @@ -67005,27 +67005,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1976.0, - "completions/mean_length": 811.447265625, - "completions/mean_terminated_length": 768.9798583984375, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1031.91015625, + "completions/mean_terminated_length": 992.75048828125, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, "epoch": 0.7889391482461381, - "grad_norm": 1.9135109186172485, - "kl": 5.8515625, - "learning_rate": 2.1691536025827982e-07, - "loss": 0.3853, - "num_tokens": 1246048139.0, - "reward": 1.90234375, - "reward_std": 0.5492792129516602, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.9296875, - "rewards/tag_count_reward/std": 0.1774672269821167, + "grad_norm": 4.95039701461792, + "kl": 1.89453125, + "learning_rate": 2.170000229334949e-07, + "loss": 0.1307, + "num_tokens": 1305772789.0, + "reward": 1.109375, + "reward_std": 0.35983261466026306, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.18235354125499725, "step": 2311 }, { @@ -67034,27 +67034,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1816.0, - "completions/mean_length": 805.12890625, - "completions/mean_terminated_length": 754.6056518554688, - "completions/min_length": 50.0, - "completions/min_terminated_length": 50.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1053.01171875, + "completions/mean_terminated_length": 993.271240234375, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, "epoch": 0.7892805325595289, - "grad_norm": 2.771057367324829, - "kl": 4.82421875, - "learning_rate": 2.1655511779290285e-07, - "loss": 0.3316, - "num_tokens": 1246543021.0, - "reward": 1.88232421875, - "reward_std": 0.44993066787719727, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.94677734375, - "rewards/tag_count_reward/std": 0.16170679032802582, + "grad_norm": 3.095932722091675, + "kl": 2.80078125, + "learning_rate": 2.1663953279283828e-07, + "loss": 0.1764, + "num_tokens": 1306394587.0, + "reward": 1.0498046875, + "reward_std": 0.3414396643638611, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.9189453125, + "rewards/tag_count_reward/std": 0.1972721517086029, "step": 2312 }, { @@ -67063,27 +67063,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1768.0, - "completions/mean_length": 797.34765625, - "completions/mean_terminated_length": 754.39599609375, - "completions/min_length": 55.0, - "completions/min_terminated_length": 55.0, - "epoch": 0.7896219168729197, - "grad_norm": 2.1502861976623535, - "kl": 5.30078125, - "learning_rate": 2.161953485915483e-07, - "loss": 0.3465, - "num_tokens": 1247032655.0, - "reward": 1.873046875, - "reward_std": 0.4173519015312195, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.89453125, - "rewards/format_reward/std": 0.3074568510055542, - "rewards/tag_count_reward/mean": 0.94921875, - "rewards/tag_count_reward/std": 0.1605987250804901, + "completions/max_terminated_length": 1940.0, + "completions/mean_length": 1027.927734375, + "completions/mean_terminated_length": 984.2994384765625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.7896219168729197, + "grad_norm": 4.1225409507751465, + "kl": 2.720703125, + "learning_rate": 2.1627951615544716e-07, + "loss": 0.1844, + "num_tokens": 1307002278.0, + "reward": 0.9912109375, + "reward_std": 0.276045024394989, + "rewards/accuracy_reward/mean": 0.037109375, + "rewards/accuracy_reward/std": 0.18921469151973724, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.17416280508041382, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.18612663447856903, "step": 2313 }, { @@ -67092,27 +67092,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 782.4375, - "completions/mean_terminated_length": 754.6506958007812, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1011.55078125, + "completions/mean_terminated_length": 975.9556274414062, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, "epoch": 0.7899633011863105, - "grad_norm": 1.537009596824646, - "kl": 5.828125, - "learning_rate": 2.1583605316484286e-07, - "loss": 0.3451, - "num_tokens": 1247508991.0, - "reward": 1.8583984375, - "reward_std": 0.5327895283699036, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.9306640625, - "rewards/tag_count_reward/std": 0.1799030750989914, + "grad_norm": 3.1654531955718994, + "kl": 2.189453125, + "learning_rate": 2.1591997353268666e-07, + "loss": 0.1291, + "num_tokens": 1307595920.0, + "reward": 1.1044921875, + "reward_std": 0.2975696623325348, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.1709705889225006, "step": 2314 }, { @@ -67121,27 +67121,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1718.0, - "completions/mean_length": 759.458984375, - "completions/mean_terminated_length": 723.2349243164062, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1014.55859375, + "completions/mean_terminated_length": 976.90283203125, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, "epoch": 0.7903046854997013, - "grad_norm": 2.1879611015319824, - "kl": 6.203125, - "learning_rate": 2.1547723202274039e-07, - "loss": 0.3981, - "num_tokens": 1247968394.0, - "reward": 1.83056640625, - "reward_std": 0.5375639200210571, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.18835864961147308, + "grad_norm": 2.0889272689819336, + "kl": 2.58984375, + "learning_rate": 2.155609054352494e-07, + "loss": 0.1258, + "num_tokens": 1308185934.0, + "reward": 1.0654296875, + "reward_std": 0.32039761543273926, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.18308307230472565, "step": 2315 }, { @@ -67150,27 +67150,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1971.0, - "completions/mean_length": 804.03515625, - "completions/mean_terminated_length": 758.70849609375, - "completions/min_length": 70.0, - "completions/min_terminated_length": 70.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1124.099609375, + "completions/mean_terminated_length": 1092.3697509765625, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, "epoch": 0.790646069813092, - "grad_norm": 10.085799217224121, - "kl": 9.140625, - "learning_rate": 2.1511888567452224e-07, - "loss": 0.5064, - "num_tokens": 1248457676.0, - "reward": 1.7333984375, - "reward_std": 0.6078810691833496, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.8916015625, - "rewards/tag_count_reward/std": 0.2308577001094818, + "grad_norm": 2.1945366859436035, + "kl": 1.94140625, + "learning_rate": 2.1520231237315326e-07, + "loss": 0.1129, + "num_tokens": 1308839089.0, + "reward": 1.033203125, + "reward_std": 0.3249098062515259, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.1789150983095169, "step": 2316 }, { @@ -67179,27 +67179,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1905.0, - "completions/mean_length": 755.626953125, - "completions/mean_terminated_length": 711.242431640625, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1029.14453125, + "completions/mean_terminated_length": 1004.6920776367188, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, "epoch": 0.7909874541264829, - "grad_norm": 3.328634262084961, - "kl": 6.3984375, - "learning_rate": 2.1476101462879504e-07, - "loss": 0.3507, - "num_tokens": 1248919309.0, - "reward": 1.8779296875, - "reward_std": 0.5386531949043274, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.9287109375, - "rewards/tag_count_reward/std": 0.18910102546215057, + "grad_norm": 5.610980033874512, + "kl": 2.634765625, + "learning_rate": 2.1484419485574196e-07, + "loss": 0.1276, + "num_tokens": 1309440763.0, + "reward": 1.15380859375, + "reward_std": 0.402589350938797, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.17849735915660858, "step": 2317 }, { @@ -67208,27 +67208,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 723.43359375, - "completions/mean_terminated_length": 688.9258422851562, - "completions/min_length": 74.0, - "completions/min_terminated_length": 74.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 941.404296875, + "completions/mean_terminated_length": 914.8460693359375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.7913288384398737, - "grad_norm": 2.646514892578125, - "kl": 6.671875, - "learning_rate": 2.1440361939349166e-07, - "loss": 0.396, - "num_tokens": 1249369083.0, - "reward": 1.87353515625, - "reward_std": 0.5484198927879333, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.18770819902420044, + "grad_norm": 5.959605693817139, + "kl": 2.880859375, + "learning_rate": 2.1448655339168347e-07, + "loss": 0.1422, + "num_tokens": 1310002138.0, + "reward": 1.12646484375, + "reward_std": 0.3662912845611572, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.93310546875, + "rewards/tag_count_reward/std": 0.17706766724586487, "step": 2318 }, { @@ -67237,27 +67237,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 817.1015625, - "completions/mean_terminated_length": 777.3951416015625, - "completions/min_length": 82.0, - "completions/min_terminated_length": 82.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1102.9609375, + "completions/mean_terminated_length": 1058.51123046875, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, "epoch": 0.7916702227532645, - "grad_norm": 2.307469129562378, - "kl": 6.171875, - "learning_rate": 2.1404670047586905e-07, - "loss": 0.3821, - "num_tokens": 1249868527.0, - "reward": 1.857421875, - "reward_std": 0.4760078191757202, - "rewards/accuracy_reward/mean": 0.04838709533214569, - "rewards/accuracy_reward/std": 0.21479946374893188, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.18832378089427948, + "grad_norm": 2.0184543132781982, + "kl": 2.953125, + "learning_rate": 2.141293884889699e-07, + "loss": 0.1385, + "num_tokens": 1310647942.0, + "reward": 1.05810546875, + "reward_std": 0.3813610076904297, + "rewards/accuracy_reward/mean": 0.08467742055654526, + "rewards/accuracy_reward/std": 0.278682142496109, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.91357421875, + "rewards/tag_count_reward/std": 0.20145341753959656, "step": 2319 }, { @@ -67266,27 +67266,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1766.0, - "completions/mean_length": 766.43359375, - "completions/mean_terminated_length": 733.0460815429688, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1062.466796875, + "completions/mean_terminated_length": 1022.4044189453125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, "epoch": 0.7920116070666553, - "grad_norm": 2.878211498260498, - "kl": 5.5390625, - "learning_rate": 2.1369025838250847e-07, - "loss": 0.359, - "num_tokens": 1250326109.0, - "reward": 1.9375, - "reward_std": 0.497491717338562, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.9453125, - "rewards/tag_count_reward/std": 0.16899466514587402, + "grad_norm": 2.302642345428467, + "kl": 2.275390625, + "learning_rate": 2.137727006549157e-07, + "loss": 0.1232, + "num_tokens": 1311257093.0, + "reward": 1.12548828125, + "reward_std": 0.3347333073616028, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.93408203125, + "rewards/tag_count_reward/std": 0.16896003484725952, "step": 2320 }, { @@ -67295,27 +67295,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1951.0, - "completions/mean_length": 790.7578125, - "completions/mean_terminated_length": 763.1536865234375, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1042.30078125, + "completions/mean_terminated_length": 1007.7616577148438, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, "epoch": 0.7923529913800461, - "grad_norm": 0.9542919993400574, - "kl": 4.8671875, - "learning_rate": 2.1333429361931412e-07, - "loss": 0.3083, - "num_tokens": 1250808753.0, - "reward": 1.93359375, - "reward_std": 0.4397716522216797, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.947265625, - "rewards/tag_count_reward/std": 0.16743822395801544, + "grad_norm": 2.98958420753479, + "kl": 3.505859375, + "learning_rate": 2.1341649039615865e-07, + "loss": 0.2133, + "num_tokens": 1311868527.0, + "reward": 1.08447265625, + "reward_std": 0.3495829701423645, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.92236328125, + "rewards/tag_count_reward/std": 0.19079317152500153, "step": 2321 }, { @@ -67324,27 +67324,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1971.0, - "completions/mean_length": 752.78515625, - "completions/mean_terminated_length": 732.2262573242188, - "completions/min_length": 114.0, - "completions/min_terminated_length": 114.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1060.400390625, + "completions/mean_terminated_length": 1024.4150390625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, "epoch": 0.7926943756934369, - "grad_norm": 1.120664358139038, - "kl": 4.78125, - "learning_rate": 2.1297880669151315e-07, - "loss": 0.2742, - "num_tokens": 1251270467.0, - "reward": 1.859375, - "reward_std": 0.41783279180526733, - "rewards/accuracy_reward/mean": 0.025390625, - "rewards/accuracy_reward/std": 0.15746226906776428, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.94921875, - "rewards/tag_count_reward/std": 0.15829749405384064, + "grad_norm": 2.5862016677856445, + "kl": 2.158203125, + "learning_rate": 2.1306075821865737e-07, + "loss": 0.1213, + "num_tokens": 1312487740.0, + "reward": 1.07666015625, + "reward_std": 0.32022154331207275, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.94580078125, + "rewards/tag_count_reward/std": 0.15985849499702454, "step": 2322 }, { @@ -67353,27 +67353,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 746.47265625, - "completions/mean_terminated_length": 707.1911010742188, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 980.69140625, + "completions/mean_terminated_length": 957.2575073242188, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.7930357600068277, - "grad_norm": 1.0932191610336304, - "kl": 4.9765625, - "learning_rate": 2.1262379810365404e-07, - "loss": 0.2948, - "num_tokens": 1251723477.0, - "reward": 1.9033203125, - "reward_std": 0.45974022150039673, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.17094823718070984, + "grad_norm": 1.9269840717315674, + "kl": 2.63671875, + "learning_rate": 2.1270550462769212e-07, + "loss": 0.1602, + "num_tokens": 1313060670.0, + "reward": 1.0947265625, + "reward_std": 0.33442410826683044, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.037109375, + "rewards/format_reward/std": 0.18921469151973724, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.1750793755054474, "step": 2323 }, { @@ -67382,27 +67382,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 775.20703125, - "completions/mean_terminated_length": 736.792724609375, - "completions/min_length": 82.0, - "completions/min_terminated_length": 82.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1011.4609375, + "completions/mean_terminated_length": 967.1283569335938, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, "epoch": 0.7933771443202184, - "grad_norm": 1.393788456916809, - "kl": 5.609375, - "learning_rate": 2.122692683596064e-07, - "loss": 0.3482, - "num_tokens": 1252198703.0, - "reward": 1.90283203125, - "reward_std": 0.5502181053161621, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.19379454851150513, + "grad_norm": 1.5138392448425293, + "kl": 2.67578125, + "learning_rate": 2.1235073012786253e-07, + "loss": 0.144, + "num_tokens": 1313656858.0, + "reward": 1.13330078125, + "reward_std": 0.3676219582557678, + "rewards/accuracy_reward/mean": 0.169921875, + "rewards/accuracy_reward/std": 0.3759314715862274, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.91845703125, + "rewards/tag_count_reward/std": 0.19172243773937225, "step": 2324 }, { @@ -67411,27 +67411,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, - "completions/mean_length": 762.34765625, - "completions/mean_terminated_length": 723.5452270507812, - "completions/min_length": 98.0, - "completions/min_terminated_length": 98.0, + "completions/mean_length": 1059.46875, + "completions/mean_terminated_length": 995.7588500976562, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, "epoch": 0.7937185286336093, - "grad_norm": 2.35156512260437, - "kl": 5.890625, - "learning_rate": 2.1191521796256067e-07, - "loss": 0.4083, - "num_tokens": 1252664241.0, - "reward": 1.86328125, - "reward_std": 0.49813130497932434, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.18036192655563354, + "grad_norm": 4.874662399291992, + "kl": 3.97265625, + "learning_rate": 2.119964352230888e-07, + "loss": 0.2157, + "num_tokens": 1314274522.0, + "reward": 1.03466796875, + "reward_std": 0.37151771783828735, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.90966796875, + "rewards/tag_count_reward/std": 0.20337004959583282, "step": 2325 }, { @@ -67440,27 +67440,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.005859375, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1939.0, - "completions/mean_length": 807.81640625, - "completions/mean_terminated_length": 800.5068969726562, - "completions/min_length": 34.0, - "completions/min_terminated_length": 34.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1082.58984375, + "completions/mean_terminated_length": 1037.1820068359375, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, "epoch": 0.7940599129470001, - "grad_norm": 1.3092200756072998, - "kl": 4.61328125, - "learning_rate": 2.1156164741502639e-07, - "loss": 0.2665, - "num_tokens": 1253155187.0, - "reward": 1.96533203125, - "reward_std": 0.4647040367126465, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.908203125, - "rewards/format_reward/std": 0.289021372795105, - "rewards/tag_count_reward/mean": 0.94775390625, - "rewards/tag_count_reward/std": 0.16352833807468414, + "grad_norm": 7.475686550140381, + "kl": 2.251953125, + "learning_rate": 2.11642620416609e-07, + "loss": 0.0859, + "num_tokens": 1314906152.0, + "reward": 1.13916015625, + "reward_std": 0.3554784953594208, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.17460595071315765, "step": 2326 }, { @@ -67469,27 +67469,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 744.103515625, - "completions/mean_terminated_length": 726.0297241210938, - "completions/min_length": 72.0, - "completions/min_terminated_length": 72.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 996.478515625, + "completions/mean_terminated_length": 964.742431640625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, "epoch": 0.7944012972603909, - "grad_norm": 1.8248697519302368, - "kl": 4.44140625, - "learning_rate": 2.1120855721883253e-07, - "loss": 0.2926, - "num_tokens": 1253610152.0, - "reward": 1.97021484375, - "reward_std": 0.4814947843551636, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, - "rewards/format_reward/mean": 0.90234375, - "rewards/format_reward/std": 0.29713961482048035, - "rewards/tag_count_reward/mean": 0.95263671875, - "rewards/tag_count_reward/std": 0.1566462367773056, + "grad_norm": 6.651371479034424, + "kl": 2.62890625, + "learning_rate": 2.1128928621097985e-07, + "loss": 0.1064, + "num_tokens": 1315490333.0, + "reward": 1.1748046875, + "reward_std": 0.39619573950767517, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38430243730545044, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.1810678094625473, "step": 2327 }, { @@ -67498,27 +67498,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1892.0, - "completions/mean_length": 794.828125, - "completions/mean_terminated_length": 769.8645629882812, - "completions/min_length": 75.0, - "completions/min_terminated_length": 75.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1068.681640625, + "completions/mean_terminated_length": 1028.8719482421875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.7947426815737817, - "grad_norm": 1.264649748802185, - "kl": 4.9140625, - "learning_rate": 2.1085594787512579e-07, - "loss": 0.2962, - "num_tokens": 1254103168.0, - "reward": 1.90380859375, - "reward_std": 0.4397355318069458, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.95068359375, - "rewards/tag_count_reward/std": 0.1666538417339325, + "grad_norm": 2.616569995880127, + "kl": 2.34375, + "learning_rate": 2.109364331080749e-07, + "loss": 0.114, + "num_tokens": 1316123562.0, + "reward": 1.0732421875, + "reward_std": 0.34903356432914734, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.19067105650901794, "step": 2328 }, { @@ -67527,27 +67527,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 813.044921875, - "completions/mean_terminated_length": 783.4060668945312, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1079.38671875, + "completions/mean_terminated_length": 1023.3511962890625, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, "epoch": 0.7950840658871725, - "grad_norm": 1.5095208883285522, - "kl": 5.671875, - "learning_rate": 2.105038198843707e-07, - "loss": 0.3433, - "num_tokens": 1254601383.0, - "reward": 1.93017578125, - "reward_std": 0.5012025833129883, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.94775390625, - "rewards/tag_count_reward/std": 0.16868269443511963, + "grad_norm": 2.174304723739624, + "kl": 2.38671875, + "learning_rate": 2.10584061609085e-07, + "loss": 0.156, + "num_tokens": 1316758144.0, + "reward": 1.13330078125, + "reward_std": 0.4128798842430115, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.1830902397632599, "step": 2329 }, { @@ -67558,25 +67558,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1962.0, - "completions/mean_length": 786.5078125, - "completions/mean_terminated_length": 751.0441284179688, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1010.283203125, + "completions/mean_terminated_length": 981.1104125976562, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.7954254502005633, - "grad_norm": 1.4137483835220337, - "kl": 6.1875, - "learning_rate": 2.101521737463487e-07, - "loss": 0.3721, - "num_tokens": 1255070907.0, - "reward": 1.955078125, - "reward_std": 0.5198351740837097, - "rewards/accuracy_reward/mean": 0.12890625, - "rewards/accuracy_reward/std": 0.33542385697364807, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.18570774793624878, + "grad_norm": 2.4027020931243896, + "kl": 1.953125, + "learning_rate": 2.1023217221451603e-07, + "loss": 0.0821, + "num_tokens": 1317342241.0, + "reward": 1.18359375, + "reward_std": 0.3709033727645874, + "rewards/accuracy_reward/mean": 0.197265625, + "rewards/accuracy_reward/std": 0.3983237147331238, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.1789150983095169, "step": 2330 }, { @@ -67585,27 +67585,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1841.0, - "completions/mean_length": 796.857421875, - "completions/mean_terminated_length": 769.38720703125, - "completions/min_length": 33.0, - "completions/min_terminated_length": 33.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 1066.17578125, + "completions/mean_terminated_length": 1005.0664672851562, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.7957668345139541, - "grad_norm": 1.2894519567489624, - "kl": 5.2109375, - "learning_rate": 2.0980100996015694e-07, - "loss": 0.3191, - "num_tokens": 1255558130.0, - "reward": 1.88232421875, - "reward_std": 0.4153871536254883, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.1762780249118805, + "grad_norm": 3.0163819789886475, + "kl": 2.671875, + "learning_rate": 2.098807654241903e-07, + "loss": 0.1444, + "num_tokens": 1317967355.0, + "reward": 1.03759765625, + "reward_std": 0.3063843846321106, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.1830902397632599, "step": 2331 }, { @@ -67614,27 +67614,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 832.83203125, - "completions/mean_terminated_length": 783.4349365234375, - "completions/min_length": 91.0, - "completions/min_terminated_length": 91.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1114.794921875, + "completions/mean_terminated_length": 1054.6507568359375, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, "epoch": 0.7961082188273448, - "grad_norm": 1.486045241355896, - "kl": 5.765625, - "learning_rate": 2.094503290242084e-07, - "loss": 0.345, - "num_tokens": 1256073788.0, - "reward": 1.859375, - "reward_std": 0.4540967345237732, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.9375, - "rewards/tag_count_reward/std": 0.18104934692382812, + "grad_norm": 4.766200542449951, + "kl": 2.84765625, + "learning_rate": 2.0952984173724348e-07, + "loss": 0.1824, + "num_tokens": 1318627378.0, + "reward": 1.0166015625, + "reward_std": 0.347095787525177, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.8994140625, + "rewards/tag_count_reward/std": 0.21476708352565765, "step": 2332 }, { @@ -67643,27 +67643,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 789.7265625, - "completions/mean_terminated_length": 777.3175659179688, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1114.95703125, + "completions/mean_terminated_length": 1060.979248046875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, "epoch": 0.7964496031407357, - "grad_norm": 0.76603102684021, - "kl": 4.84765625, - "learning_rate": 2.091001314362303e-07, - "loss": 0.2737, - "num_tokens": 1256561504.0, - "reward": 1.93310546875, - "reward_std": 0.4562763571739197, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.90234375, - "rewards/format_reward/std": 0.29713961482048035, - "rewards/tag_count_reward/mean": 0.95263671875, - "rewards/tag_count_reward/std": 0.15428605675697327, + "grad_norm": 5.242162704467773, + "kl": 2.87109375, + "learning_rate": 2.091794016521259e-07, + "loss": 0.1718, + "num_tokens": 1319281612.0, + "reward": 1.02490234375, + "reward_std": 0.34907281398773193, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.90380859375, + "rewards/tag_count_reward/std": 0.21874071657657623, "step": 2333 }, { @@ -67672,27 +67672,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1980.0, - "completions/mean_length": 842.16796875, - "completions/mean_terminated_length": 800.755615234375, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1087.93359375, + "completions/mean_terminated_length": 1030.2899169921875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, "epoch": 0.7967909874541265, - "grad_norm": 1.4693493843078613, - "kl": 7.1640625, - "learning_rate": 2.087504176932643e-07, - "loss": 0.4425, - "num_tokens": 1257068758.0, - "reward": 1.83984375, - "reward_std": 0.4927568733692169, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.18829332292079926, + "grad_norm": 2.320801258087158, + "kl": 2.96875, + "learning_rate": 2.088294456666002e-07, + "loss": 0.154, + "num_tokens": 1319914698.0, + "reward": 1.00634765625, + "reward_std": 0.3809760808944702, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.89306640625, + "rewards/tag_count_reward/std": 0.2264673113822937, "step": 2334 }, { @@ -67701,27 +67701,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 839.037109375, - "completions/mean_terminated_length": 771.7340698242188, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1037.587890625, + "completions/mean_terminated_length": 987.8954467773438, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, "epoch": 0.7971323717675173, - "grad_norm": 2.0895612239837646, - "kl": 8.5703125, - "learning_rate": 2.0840118829166498e-07, - "loss": 0.5156, - "num_tokens": 1257570265.0, - "reward": 1.84326171875, - "reward_std": 0.5723297595977783, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.20506852865219116, + "grad_norm": 2.765597105026245, + "kl": 2.1953125, + "learning_rate": 2.0847997427774222e-07, + "loss": 0.1161, + "num_tokens": 1320517863.0, + "reward": 1.0703125, + "reward_std": 0.3526250422000885, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.21157780289649963, "step": 2335 }, { @@ -67730,27 +67730,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 842.220703125, - "completions/mean_terminated_length": 785.5071411132812, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1065.51171875, + "completions/mean_terminated_length": 1029.7125244140625, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, "epoch": 0.7974737560809081, - "grad_norm": 1.2615413665771484, - "kl": 7.96875, - "learning_rate": 2.0805244372709952e-07, - "loss": 0.4921, - "num_tokens": 1258081034.0, - "reward": 1.837890625, - "reward_std": 0.5462841987609863, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.19686728715896606, + "grad_norm": 6.627567768096924, + "kl": 2.373046875, + "learning_rate": 2.0813098798193875e-07, + "loss": 0.151, + "num_tokens": 1321142957.0, + "reward": 1.033203125, + "reward_std": 0.3327845335006714, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.900390625, + "rewards/tag_count_reward/std": 0.20770350098609924, "step": 2336 }, { @@ -67759,27 +67759,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 845.64453125, - "completions/mean_terminated_length": 804.3515625, - "completions/min_length": 125.0, - "completions/min_terminated_length": 125.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1129.93359375, + "completions/mean_terminated_length": 1068.729248046875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, "epoch": 0.7978151403942989, - "grad_norm": 0.9753010869026184, - "kl": 7.1875, - "learning_rate": 2.077041844945472e-07, - "loss": 0.4314, - "num_tokens": 1258589540.0, - "reward": 1.89111328125, - "reward_std": 0.5716447830200195, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.1974627822637558, + "grad_norm": 2.5888054370880127, + "kl": 2.49609375, + "learning_rate": 2.0778248727488807e-07, + "loss": 0.1183, + "num_tokens": 1321797019.0, + "reward": 1.08984375, + "reward_std": 0.3541601896286011, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.20036010444164276, "step": 2337 }, { @@ -67788,27 +67788,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 825.892578125, - "completions/mean_terminated_length": 765.7888793945312, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1048.158203125, + "completions/mean_terminated_length": 1003.267333984375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.7981565247076897, - "grad_norm": 1.5391372442245483, - "kl": 7.796875, - "learning_rate": 2.0735641108829813e-07, - "loss": 0.5046, - "num_tokens": 1259089293.0, - "reward": 1.82763671875, - "reward_std": 0.5773891806602478, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.21019071340560913, + "grad_norm": 3.669558525085449, + "kl": 2.0, + "learning_rate": 2.0743447265159849e-07, + "loss": 0.1147, + "num_tokens": 1322410572.0, + "reward": 1.0947265625, + "reward_std": 0.36656540632247925, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.19635941088199615, "step": 2338 }, { @@ -67817,27 +67817,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 814.05078125, - "completions/mean_terminated_length": 776.808837890625, - "completions/min_length": 89.0, - "completions/min_terminated_length": 89.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1080.580078125, + "completions/mean_terminated_length": 1022.494873046875, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, "epoch": 0.7984979090210805, - "grad_norm": 0.7947782278060913, - "kl": 7.4921875, - "learning_rate": 2.070091240019533e-07, - "loss": 0.4681, - "num_tokens": 1259586087.0, - "reward": 1.810546875, - "reward_std": 0.5679647326469421, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.20555779337882996, + "grad_norm": 6.862471580505371, + "kl": 2.46875, + "learning_rate": 2.0708694460638815e-07, + "loss": 0.1623, + "num_tokens": 1323043829.0, + "reward": 1.05029296875, + "reward_std": 0.3185489773750305, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.17416280508041382, + "rewards/tag_count_reward/mean": 0.92529296875, + "rewards/tag_count_reward/std": 0.1808057427406311, "step": 2339 }, { @@ -67846,27 +67846,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 868.98046875, - "completions/mean_terminated_length": 813.5255737304688, - "completions/min_length": 168.0, - "completions/min_terminated_length": 168.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1064.53515625, + "completions/mean_terminated_length": 1014.04931640625, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, "epoch": 0.7988392933344712, - "grad_norm": 1.0265549421310425, - "kl": 6.1171875, - "learning_rate": 2.066623237284229e-07, - "loss": 0.3741, - "num_tokens": 1260107053.0, - "reward": 1.9287109375, - "reward_std": 0.5877367854118347, - "rewards/accuracy_reward/mean": 0.14453125, - "rewards/accuracy_reward/std": 0.35197147727012634, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.205490380525589, + "grad_norm": 2.4543957710266113, + "kl": 2.271484375, + "learning_rate": 2.0673990363288354e-07, + "loss": 0.1476, + "num_tokens": 1323664919.0, + "reward": 1.14453125, + "reward_std": 0.39314723014831543, + "rewards/accuracy_reward/mean": 0.181640625, + "rewards/accuracy_reward/std": 0.38592514395713806, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.19541551172733307, "step": 2340 }, { @@ -67877,25 +67877,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 904.48046875, - "completions/mean_terminated_length": 835.8219604492188, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1063.69921875, + "completions/mean_terminated_length": 1004.6004028320312, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, "epoch": 0.799180677647862, - "grad_norm": 1.025760293006897, - "kl": 7.3671875, - "learning_rate": 2.0631601075992677e-07, - "loss": 0.4521, - "num_tokens": 1260640691.0, - "reward": 1.78173828125, - "reward_std": 0.5654925107955933, - "rewards/accuracy_reward/mean": 0.038306452333927155, - "rewards/accuracy_reward/std": 0.19212882220745087, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.21603746712207794, + "grad_norm": 4.214763641357422, + "kl": 2.408203125, + "learning_rate": 2.0639335022401998e-07, + "loss": 0.1611, + "num_tokens": 1324280077.0, + "reward": 1.0205078125, + "reward_std": 0.34186527132987976, + "rewards/accuracy_reward/mean": 0.06854838877916336, + "rewards/accuracy_reward/std": 0.25293970108032227, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.9111328125, + "rewards/tag_count_reward/std": 0.2019064873456955, "step": 2341 }, { @@ -67904,27 +67904,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1984.0, - "completions/mean_length": 867.6953125, - "completions/mean_terminated_length": 824.6882934570312, - "completions/min_length": 89.0, - "completions/min_terminated_length": 89.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1077.1796875, + "completions/mean_terminated_length": 1005.9454345703125, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, "epoch": 0.7995220619612529, - "grad_norm": 2.2666099071502686, - "kl": 6.3203125, - "learning_rate": 2.0597018558799272e-07, - "loss": 0.3268, - "num_tokens": 1261165735.0, - "reward": 1.83740234375, - "reward_std": 0.5497316718101501, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.20248481631278992, + "grad_norm": 1.7704473733901978, + "kl": 2.41796875, + "learning_rate": 2.060472848720396e-07, + "loss": 0.1374, + "num_tokens": 1324912377.0, + "reward": 1.04541015625, + "reward_std": 0.3238769769668579, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.91845703125, + "rewards/tag_count_reward/std": 0.1923593431711197, "step": 2342 }, { @@ -67933,27 +67933,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1902.0, - "completions/mean_length": 899.16796875, - "completions/mean_terminated_length": 857.3077392578125, - "completions/min_length": 176.0, - "completions/min_terminated_length": 176.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1136.974609375, + "completions/mean_terminated_length": 1098.01025390625, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, "epoch": 0.7998634462746437, - "grad_norm": 2.0839056968688965, - "kl": 4.3828125, - "learning_rate": 2.056248487034562e-07, - "loss": 0.297, - "num_tokens": 1261706445.0, - "reward": 1.892578125, - "reward_std": 0.4774249196052551, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.1726529598236084, + "grad_norm": 1.9443188905715942, + "kl": 1.7041015625, + "learning_rate": 2.0570170806849172e-07, + "loss": 0.097, + "num_tokens": 1325574844.0, + "reward": 1.05859375, + "reward_std": 0.33676382899284363, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.17685236036777496, "step": 2343 }, { @@ -67962,27 +67962,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 837.255859375, - "completions/mean_terminated_length": 793.1397094726562, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1037.166015625, + "completions/mean_terminated_length": 989.6216430664062, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, "epoch": 0.8002048305880345, - "grad_norm": 2.329362630844116, - "kl": 5.640625, - "learning_rate": 2.0528000059645995e-07, - "loss": 0.3844, - "num_tokens": 1262214304.0, - "reward": 1.83642578125, - "reward_std": 0.47069251537323, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.18874379992485046, + "grad_norm": 2.564016580581665, + "kl": 2.3671875, + "learning_rate": 2.0535662030423163e-07, + "loss": 0.0995, + "num_tokens": 1326185057.0, + "reward": 1.029296875, + "reward_std": 0.3521963357925415, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.919921875, + "rewards/tag_count_reward/std": 0.1875101923942566, "step": 2344 }, { @@ -67991,27 +67991,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1960.0, - "completions/mean_length": 854.923828125, - "completions/mean_terminated_length": 828.728515625, - "completions/min_length": 176.0, - "completions/min_terminated_length": 176.0, + "completions/clipped_ratio": 0.064453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1105.974609375, + "completions/mean_terminated_length": 1041.0751953125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, "epoch": 0.8005462149014253, - "grad_norm": 1.5461212396621704, - "kl": 4.9296875, - "learning_rate": 2.0493564175645256e-07, - "loss": 0.3225, - "num_tokens": 1262734457.0, - "reward": 1.92041015625, - "reward_std": 0.5226565599441528, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.1828918159008026, + "grad_norm": 8.724496841430664, + "kl": 2.8984375, + "learning_rate": 2.0501202206942008e-07, + "loss": 0.1994, + "num_tokens": 1326833748.0, + "reward": 1.08056640625, + "reward_std": 0.35309576988220215, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, + "rewards/format_reward/mean": 0.025390625, + "rewards/format_reward/std": 0.15746226906776428, + "rewards/tag_count_reward/mean": 0.91259765625, + "rewards/tag_count_reward/std": 0.19610315561294556, "step": 2345 }, { @@ -68020,27 +68020,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 901.26171875, - "completions/mean_terminated_length": 842.394287109375, - "completions/min_length": 206.0, - "completions/min_terminated_length": 206.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1101.806640625, + "completions/mean_terminated_length": 1044.995849609375, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, "epoch": 0.8008875992148161, - "grad_norm": 3.50020170211792, - "kl": 5.2265625, - "learning_rate": 2.0459177267218878e-07, - "loss": 0.3717, - "num_tokens": 1263283727.0, - "reward": 1.85595703125, - "reward_std": 0.47672832012176514, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.18353331089019775, + "grad_norm": 2.091207265853882, + "kl": 2.705078125, + "learning_rate": 2.0466791385352224e-07, + "loss": 0.1368, + "num_tokens": 1327485697.0, + "reward": 1.0517578125, + "reward_std": 0.3393837809562683, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.9111328125, + "rewards/tag_count_reward/std": 0.2019064873456955, "step": 2346 }, { @@ -68049,27 +68049,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1984.0, - "completions/mean_length": 834.431640625, - "completions/mean_terminated_length": 797.8048095703125, - "completions/min_length": 96.0, - "completions/min_terminated_length": 96.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 990.908203125, + "completions/mean_terminated_length": 936.6427612304688, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.8012289835282069, - "grad_norm": 1.2660900354385376, - "kl": 5.4765625, - "learning_rate": 2.042483938317276e-07, - "loss": 0.3455, - "num_tokens": 1263788364.0, - "reward": 1.88525390625, - "reward_std": 0.5068372488021851, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.18019695580005646, + "grad_norm": 1.923886775970459, + "kl": 2.587890625, + "learning_rate": 2.0432429614530761e-07, + "loss": 0.1526, + "num_tokens": 1328070450.0, + "reward": 1.09814453125, + "reward_std": 0.3478143513202667, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.91845703125, + "rewards/tag_count_reward/std": 0.20408298075199127, "step": 2347 }, { @@ -68078,27 +68078,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 836.697265625, - "completions/mean_terminated_length": 792.5607299804688, - "completions/min_length": 256.0, - "completions/min_terminated_length": 256.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1012.892578125, + "completions/mean_terminated_length": 959.7556762695312, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, "epoch": 0.8015703678415976, - "grad_norm": 0.9718857407569885, - "kl": 5.6875, - "learning_rate": 2.0390550572243242e-07, - "loss": 0.3645, - "num_tokens": 1264292545.0, - "reward": 1.89111328125, - "reward_std": 0.4370141923427582, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.93798828125, - "rewards/tag_count_reward/std": 0.18422965705394745, + "grad_norm": 4.5194926261901855, + "kl": 2.65625, + "learning_rate": 2.0398116943284848e-07, + "loss": 0.176, + "num_tokens": 1328664843.0, + "reward": 1.04638671875, + "reward_std": 0.28775566816329956, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.033203125, + "rewards/format_reward/std": 0.17934183776378632, + "rewards/tag_count_reward/mean": 0.93310546875, + "rewards/tag_count_reward/std": 0.17980943620204926, "step": 2348 }, { @@ -68107,27 +68107,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 874.984375, - "completions/mean_terminated_length": 842.0079956054688, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1094.0625, + "completions/mean_terminated_length": 1051.232666015625, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, "epoch": 0.8019117521549884, - "grad_norm": 1.066348910331726, - "kl": 5.609375, - "learning_rate": 2.0356310883097045e-07, - "loss": 0.3583, - "num_tokens": 1264819481.0, - "reward": 1.90185546875, - "reward_std": 0.4515777826309204, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.896484375, - "rewards/format_reward/std": 0.30492907762527466, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.16919739544391632, + "grad_norm": 3.0292017459869385, + "kl": 2.474609375, + "learning_rate": 2.0363853420352022e-07, + "loss": 0.1132, + "num_tokens": 1329303947.0, + "reward": 1.0634765625, + "reward_std": 0.3626502454280853, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.9169921875, + "rewards/tag_count_reward/std": 0.2043900191783905, "step": 2349 }, { @@ -68136,27 +68136,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1907.0, - "completions/mean_length": 794.236328125, - "completions/mean_terminated_length": 751.1777954101562, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1004.39453125, + "completions/mean_terminated_length": 979.3480224609375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, "epoch": 0.8022531364683793, - "grad_norm": 2.048433303833008, - "kl": 5.1171875, - "learning_rate": 2.0322120364331119e-07, - "loss": 0.3373, - "num_tokens": 1265299522.0, - "reward": 1.91845703125, - "reward_std": 0.4585955739021301, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.896484375, - "rewards/format_reward/std": 0.30492907762527466, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.1794690042734146, + "grad_norm": 2.9248127937316895, + "kl": 2.705078125, + "learning_rate": 2.032963909439999e-07, + "loss": 0.1405, + "num_tokens": 1329891589.0, + "reward": 1.10498046875, + "reward_std": 0.3709397315979004, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.17361809313297272, "step": 2350 }, { @@ -68165,27 +68165,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 838.75, - "completions/mean_terminated_length": 802.2534790039062, - "completions/min_length": 83.0, - "completions/min_terminated_length": 83.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1010.548828125, + "completions/mean_terminated_length": 970.56591796875, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, "epoch": 0.8025945207817701, - "grad_norm": 4.7970685958862305, - "kl": 8.3046875, - "learning_rate": 2.028797906447268e-07, - "loss": 0.4665, - "num_tokens": 1265814274.0, - "reward": 1.85400390625, - "reward_std": 0.5632809400558472, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.19585449993610382, + "grad_norm": 1.8398419618606567, + "kl": 2.84765625, + "learning_rate": 2.0295474014026592e-07, + "loss": 0.1465, + "num_tokens": 1330494302.0, + "reward": 1.12255859375, + "reward_std": 0.3614022135734558, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.92724609375, + "rewards/tag_count_reward/std": 0.18227426707744598, "step": 2351 }, { @@ -68194,27 +68194,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1941.0, - "completions/mean_length": 829.587890625, - "completions/mean_terminated_length": 782.6307983398438, - "completions/min_length": 134.0, - "completions/min_terminated_length": 134.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1015.685546875, + "completions/mean_terminated_length": 982.3850708007812, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, "epoch": 0.8029359050951609, - "grad_norm": 2.448554277420044, - "kl": 6.640625, - "learning_rate": 2.025388703197903e-07, - "loss": 0.4144, - "num_tokens": 1266320575.0, - "reward": 1.95458984375, - "reward_std": 0.503436803817749, - "rewards/accuracy_reward/mean": 0.12298387289047241, - "rewards/accuracy_reward/std": 0.32875028252601624, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.94677734375, - "rewards/tag_count_reward/std": 0.16910135746002197, + "grad_norm": 3.5638630390167236, + "kl": 2.75390625, + "learning_rate": 2.0261358227759687e-07, + "loss": 0.1284, + "num_tokens": 1331095885.0, + "reward": 1.14990234375, + "reward_std": 0.38127022981643677, + "rewards/accuracy_reward/mean": 0.1733870953321457, + "rewards/accuracy_reward/std": 0.37896379828453064, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.1804301142692566, "step": 2352 }, { @@ -68223,27 +68223,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1999.0, - "completions/mean_length": 842.962890625, - "completions/mean_terminated_length": 809.0863037109375, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1064.154296875, + "completions/mean_terminated_length": 1019.9815673828125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, "epoch": 0.8032772894085517, - "grad_norm": 3.1435487270355225, - "kl": 7.37109375, - "learning_rate": 2.0219844315237595e-07, - "loss": 0.4264, - "num_tokens": 1266832396.0, - "reward": 1.88037109375, - "reward_std": 0.5150898694992065, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.1915329545736313, + "grad_norm": 3.6850082874298096, + "kl": 2.9609375, + "learning_rate": 2.0227291784057166e-07, + "loss": 0.1234, + "num_tokens": 1331720956.0, + "reward": 1.095703125, + "reward_std": 0.3694269061088562, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.072265625, + "rewards/format_reward/std": 0.2591804563999176, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.19282633066177368, "step": 2353 }, { @@ -68252,27 +68252,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 875.8984375, - "completions/mean_terminated_length": 830.7261352539062, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1061.306640625, + "completions/mean_terminated_length": 1037.6260986328125, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, "epoch": 0.8036186737219425, - "grad_norm": 3.2759320735931396, - "kl": 8.3671875, - "learning_rate": 2.0185850962565782e-07, - "loss": 0.5176, - "num_tokens": 1267364200.0, - "reward": 1.90869140625, - "reward_std": 0.540012001991272, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.1932711899280548, + "grad_norm": 3.2790260314941406, + "kl": 2.232421875, + "learning_rate": 2.0193274731306777e-07, + "loss": 0.1092, + "num_tokens": 1332347689.0, + "reward": 1.166015625, + "reward_std": 0.3547057807445526, + "rewards/accuracy_reward/mean": 0.17578125, + "rewards/accuracy_reward/std": 0.3810062110424042, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.943359375, + "rewards/tag_count_reward/std": 0.16541723906993866, "step": 2354 }, { @@ -68281,27 +68281,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 832.54296875, - "completions/mean_terminated_length": 805.8562622070312, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1057.513671875, + "completions/mean_terminated_length": 1039.791259765625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.8039600580353333, - "grad_norm": 1.754936933517456, - "kl": 4.85546875, - "learning_rate": 2.0151907022210917e-07, - "loss": 0.3045, - "num_tokens": 1267873438.0, - "reward": 1.91552734375, - "reward_std": 0.4817492365837097, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.94287109375, - "rewards/tag_count_reward/std": 0.17070980370044708, + "grad_norm": 9.811931610107422, + "kl": 2.42578125, + "learning_rate": 2.0159307117826152e-07, + "loss": 0.0865, + "num_tokens": 1332972112.0, + "reward": 1.12646484375, + "reward_std": 0.36042797565460205, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.94482421875, + "rewards/tag_count_reward/std": 0.16028828918933868, "step": 2355 }, { @@ -68312,25 +68312,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1921.0, - "completions/mean_length": 756.267578125, - "completions/mean_terminated_length": 709.200439453125, - "completions/min_length": 75.0, - "completions/min_terminated_length": 75.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 954.892578125, + "completions/mean_terminated_length": 915.0628051757812, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, "epoch": 0.804301442348724, - "grad_norm": 1.607111930847168, - "kl": 6.8125, - "learning_rate": 2.0118012542350245e-07, - "loss": 0.4416, - "num_tokens": 1268334135.0, - "reward": 1.97607421875, - "reward_std": 0.5530889630317688, - "rewards/accuracy_reward/mean": 0.16015625, - "rewards/accuracy_reward/std": 0.3671095669269562, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.1798572540283203, + "grad_norm": 2.1637425422668457, + "kl": 2.763671875, + "learning_rate": 2.01253889918627e-07, + "loss": 0.1472, + "num_tokens": 1333534505.0, + "reward": 1.154296875, + "reward_std": 0.4104391932487488, + "rewards/accuracy_reward/mean": 0.173828125, + "rewards/accuracy_reward/std": 0.3793322443962097, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.919921875, + "rewards/tag_count_reward/std": 0.1932915896177292, "step": 2356 }, { @@ -68339,27 +68339,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 842.298828125, - "completions/mean_terminated_length": 798.3663940429688, - "completions/min_length": 125.0, - "completions/min_terminated_length": 125.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1083.40234375, + "completions/mean_terminated_length": 1044.1910400390625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, "epoch": 0.8046428266621148, - "grad_norm": 1.7854558229446411, - "kl": 6.0625, - "learning_rate": 2.0084167571090753e-07, - "loss": 0.3664, - "num_tokens": 1268849936.0, - "reward": 1.86474609375, - "reward_std": 0.5049247741699219, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.1900404542684555, + "grad_norm": 4.516602039337158, + "kl": 2.390625, + "learning_rate": 2.0091520401593536e-07, + "loss": 0.1022, + "num_tokens": 1334173751.0, + "reward": 1.11474609375, + "reward_std": 0.38311493396759033, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.18731071054935455, "step": 2357 }, { @@ -68368,27 +68368,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1896.0, - "completions/mean_length": 905.337890625, - "completions/mean_terminated_length": 870.85107421875, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1109.412109375, + "completions/mean_terminated_length": 1086.8861083984375, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.8049842109755057, - "grad_norm": 2.130006790161133, - "kl": 4.6015625, - "learning_rate": 2.0050372156469214e-07, - "loss": 0.2799, - "num_tokens": 1269401965.0, - "reward": 1.88232421875, - "reward_std": 0.49413174390792847, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.94091796875, - "rewards/tag_count_reward/std": 0.1750049889087677, + "grad_norm": 2.129068613052368, + "kl": 2.58203125, + "learning_rate": 2.0057701395125392e-07, + "loss": 0.1083, + "num_tokens": 1334830266.0, + "reward": 1.08447265625, + "reward_std": 0.3720419108867645, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.19087830185890198, "step": 2358 }, { @@ -68397,27 +68397,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 845.869140625, - "completions/mean_terminated_length": 794.4542236328125, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1079.578125, + "completions/mean_terminated_length": 1029.864501953125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.8053255952888965, - "grad_norm": 1.481907844543457, - "kl": 5.859375, - "learning_rate": 2.0016626346452027e-07, - "loss": 0.39, - "num_tokens": 1269916570.0, - "reward": 1.8525390625, - "reward_std": 0.41104599833488464, - "rewards/accuracy_reward/mean": 0.01953125, - "rewards/accuracy_reward/std": 0.1385180652141571, - "rewards/format_reward/mean": 0.892578125, - "rewards/format_reward/std": 0.30995169281959534, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.17518849670886993, + "grad_norm": 3.94397234916687, + "kl": 2.357421875, + "learning_rate": 2.0023932020494602e-07, + "loss": 0.1501, + "num_tokens": 1335464530.0, + "reward": 0.99462890625, + "reward_std": 0.30469590425491333, + "rewards/accuracy_reward/mean": 0.033203125, + "rewards/accuracy_reward/std": 0.17934183776378632, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.18829776346683502, "step": 2359 }, { @@ -68426,27 +68426,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 792.029296875, - "completions/mean_terminated_length": 751.5140991210938, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1023.720703125, + "completions/mean_terminated_length": 988.54345703125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, "epoch": 0.8056669796022873, - "grad_norm": 1.5508971214294434, - "kl": 5.56640625, - "learning_rate": 1.998293018893518e-07, - "loss": 0.3609, - "num_tokens": 1270392025.0, - "reward": 1.89794921875, - "reward_std": 0.4793471395969391, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.94091796875, - "rewards/tag_count_reward/std": 0.17570249736309052, + "grad_norm": 3.0667355060577393, + "kl": 1.98828125, + "learning_rate": 1.9990212325666973e-07, + "loss": 0.1048, + "num_tokens": 1336058611.0, + "reward": 1.09228515625, + "reward_std": 0.3379972279071808, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.93994140625, + "rewards/tag_count_reward/std": 0.16531828045845032, "step": 2360 }, { @@ -68455,27 +68455,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1950.0, - "completions/mean_length": 796.81640625, - "completions/mean_terminated_length": 771.8924560546875, - "completions/min_length": 73.0, - "completions/min_terminated_length": 73.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1046.73046875, + "completions/mean_terminated_length": 1024.7464599609375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, "epoch": 0.8060083639156781, - "grad_norm": 1.2236746549606323, - "kl": 4.984375, - "learning_rate": 1.994928373174423e-07, - "loss": 0.3234, - "num_tokens": 1270871115.0, - "reward": 1.9189453125, - "reward_std": 0.5018002986907959, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.9443359375, - "rewards/tag_count_reward/std": 0.16867490112781525, + "grad_norm": 5.71998405456543, + "kl": 2.115234375, + "learning_rate": 1.9956542358537764e-07, + "loss": 0.0808, + "num_tokens": 1336665657.0, + "reward": 1.14404296875, + "reward_std": 0.38886284828186035, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.16935545206069946, "step": 2361 }, { @@ -68484,27 +68484,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1669.0, - "completions/mean_length": 734.2421875, - "completions/mean_terminated_length": 708.0717163085938, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 977.4453125, + "completions/mean_terminated_length": 929.3795776367188, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, "epoch": 0.8063497482290689, - "grad_norm": 1.1561851501464844, - "kl": 3.3671875, - "learning_rate": 1.991568702263415e-07, - "loss": 0.203, - "num_tokens": 1271328935.0, - "reward": 2.00146484375, - "reward_std": 0.4283190071582794, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, - "rewards/format_reward/mean": 0.912109375, - "rewards/format_reward/std": 0.2834126651287079, - "rewards/tag_count_reward/mean": 0.95654296875, - "rewards/tag_count_reward/std": 0.153054878115654, + "grad_norm": 1.9589004516601562, + "kl": 3.4765625, + "learning_rate": 1.992292216693162e-07, + "loss": 0.1934, + "num_tokens": 1337247997.0, + "reward": 1.158203125, + "reward_std": 0.4453861713409424, + "rewards/accuracy_reward/mean": 0.177734375, + "rewards/accuracy_reward/std": 0.3826628625392914, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.912109375, + "rewards/tag_count_reward/std": 0.19990174472332, "step": 2362 }, { @@ -68513,27 +68513,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 866.05078125, - "completions/mean_terminated_length": 807.9220581054688, - "completions/min_length": 183.0, - "completions/min_terminated_length": 183.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1082.712890625, + "completions/mean_terminated_length": 1041.427734375, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, "epoch": 0.8066911325424597, - "grad_norm": 1.1642218828201294, - "kl": 6.8359375, - "learning_rate": 1.9882140109289342e-07, - "loss": 0.4629, - "num_tokens": 1271850433.0, - "reward": 1.8515625, - "reward_std": 0.5475321412086487, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.19384446740150452, + "grad_norm": 1.6740443706512451, + "kl": 2.189453125, + "learning_rate": 1.9889351798602454e-07, + "loss": 0.1094, + "num_tokens": 1337880426.0, + "reward": 1.08056640625, + "reward_std": 0.3808703124523163, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.91845703125, + "rewards/tag_count_reward/std": 0.19675986468791962, "step": 2363 }, { @@ -68542,27 +68542,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1956.0, - "completions/mean_length": 813.064453125, - "completions/mean_terminated_length": 762.86376953125, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1060.7265625, + "completions/mean_terminated_length": 1022.677490234375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, "epoch": 0.8070325168558504, - "grad_norm": 1.0698622465133667, - "kl": 6.3046875, - "learning_rate": 1.9848643039323499e-07, - "loss": 0.3831, - "num_tokens": 1272336914.0, - "reward": 1.9189453125, - "reward_std": 0.5730538368225098, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.19908507168293, + "grad_norm": 1.3629320859909058, + "kl": 2.458984375, + "learning_rate": 1.9855831301233414e-07, + "loss": 0.1286, + "num_tokens": 1338493710.0, + "reward": 1.13330078125, + "reward_std": 0.42396247386932373, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.91845703125, + "rewards/tag_count_reward/std": 0.19613726437091827, "step": 2364 }, { @@ -68571,27 +68571,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1850.0, - "completions/mean_length": 817.259765625, - "completions/mean_terminated_length": 774.991943359375, - "completions/min_length": 89.0, - "completions/min_terminated_length": 89.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1058.240234375, + "completions/mean_terminated_length": 1005.2901000976562, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, "epoch": 0.8073739011692412, - "grad_norm": 1.8406894207000732, - "kl": 6.78125, - "learning_rate": 1.9815195860279594e-07, - "loss": 0.4044, - "num_tokens": 1272834247.0, - "reward": 1.8310546875, - "reward_std": 0.5293779373168945, - "rewards/accuracy_reward/mean": 0.0463709682226181, - "rewards/accuracy_reward/std": 0.21049949526786804, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.19908507168293, + "grad_norm": 3.6040313243865967, + "kl": 2.9453125, + "learning_rate": 1.9822360722436838e-07, + "loss": 0.1602, + "num_tokens": 1339114425.0, + "reward": 1.05517578125, + "reward_std": 0.3515712320804596, + "rewards/accuracy_reward/mean": 0.07056451588869095, + "rewards/accuracy_reward/std": 0.25635457038879395, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.92236328125, + "rewards/tag_count_reward/std": 0.1914331614971161, "step": 2365 }, { @@ -68600,27 +68600,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 853.341796875, - "completions/mean_terminated_length": 814.8043823242188, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1109.423828125, + "completions/mean_terminated_length": 1042.6632080078125, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, "epoch": 0.807715285482632, - "grad_norm": 1.2284071445465088, - "kl": 4.6171875, - "learning_rate": 1.978179861962978e-07, - "loss": 0.2866, - "num_tokens": 1273352966.0, - "reward": 1.93408203125, - "reward_std": 0.4858850836753845, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.94970703125, - "rewards/tag_count_reward/std": 0.15883907675743103, + "grad_norm": 7.065984725952148, + "kl": 2.259765625, + "learning_rate": 1.9788940109754114e-07, + "loss": 0.1651, + "num_tokens": 1339764258.0, + "reward": 1.08935546875, + "reward_std": 0.3808128535747528, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.91748046875, + "rewards/tag_count_reward/std": 0.18742799758911133, "step": 2366 }, { @@ -68629,27 +68629,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 804.623046875, - "completions/mean_terminated_length": 764.5140991210938, - "completions/min_length": 183.0, - "completions/min_terminated_length": 183.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1070.564453125, + "completions/mean_terminated_length": 1014.0185546875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, "epoch": 0.8080566697960229, - "grad_norm": 1.5557031631469727, - "kl": 6.0859375, - "learning_rate": 1.9748451364775318e-07, - "loss": 0.3996, - "num_tokens": 1273850533.0, - "reward": 1.84814453125, - "reward_std": 0.47291260957717896, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.1788075864315033, + "grad_norm": 2.8818702697753906, + "kl": 2.69921875, + "learning_rate": 1.9755569510655701e-07, + "loss": 0.1715, + "num_tokens": 1340397987.0, + "reward": 1.03369140625, + "reward_std": 0.3466808795928955, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.89892578125, + "rewards/tag_count_reward/std": 0.2221004068851471, "step": 2367 }, { @@ -68658,27 +68658,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 784.146484375, - "completions/mean_terminated_length": 764.0853881835938, - "completions/min_length": 172.0, - "completions/min_terminated_length": 172.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1065.4375, + "completions/mean_terminated_length": 1023.4135131835938, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, "epoch": 0.8083980541094137, - "grad_norm": 1.4546853303909302, - "kl": 4.65625, - "learning_rate": 1.9715154143046558e-07, - "loss": 0.2798, - "num_tokens": 1274335216.0, - "reward": 1.88720703125, - "reward_std": 0.4807460308074951, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.93994140625, - "rewards/tag_count_reward/std": 0.17396998405456543, + "grad_norm": 1.6329728364944458, + "kl": 2.396484375, + "learning_rate": 1.9722248972541e-07, + "loss": 0.1337, + "num_tokens": 1341026691.0, + "reward": 1.13916015625, + "reward_std": 0.3848889470100403, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.92236328125, + "rewards/tag_count_reward/std": 0.1888602077960968, "step": 2368 }, { @@ -68687,27 +68687,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1973.0, - "completions/mean_length": 800.859375, - "completions/mean_terminated_length": 765.7991943359375, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1036.474609375, + "completions/mean_terminated_length": 984.5482788085938, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.8087394384228045, - "grad_norm": 1.8968936204910278, - "kl": 5.03125, - "learning_rate": 1.968190700170279e-07, - "loss": 0.3277, - "num_tokens": 1274820872.0, - "reward": 1.97900390625, - "reward_std": 0.5076891183853149, - "rewards/accuracy_reward/mean": 0.142578125, - "rewards/accuracy_reward/std": 0.3499840497970581, - "rewards/format_reward/mean": 0.892578125, - "rewards/format_reward/std": 0.30995169281959534, - "rewards/tag_count_reward/mean": 0.94384765625, - "rewards/tag_count_reward/std": 0.17174777388572693, + "grad_norm": 2.1450865268707275, + "kl": 2.916015625, + "learning_rate": 1.9688978542738326e-07, + "loss": 0.1739, + "num_tokens": 1341632982.0, + "reward": 1.1767578125, + "reward_std": 0.4225959777832031, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.39980348944664, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.18347929418087006, "step": 2369 }, { @@ -68716,27 +68716,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 766.546875, - "completions/mean_terminated_length": 741.0199584960938, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 992.21875, + "completions/mean_terminated_length": 962.5381469726562, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.8090808227361953, - "grad_norm": 1.373109221458435, - "kl": 4.734375, - "learning_rate": 1.9648709987932282e-07, - "loss": 0.315, - "num_tokens": 1275286736.0, - "reward": 1.8740234375, - "reward_std": 0.4075538218021393, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.892578125, - "rewards/format_reward/std": 0.30995169281959534, - "rewards/tag_count_reward/mean": 0.9443359375, - "rewards/tag_count_reward/std": 0.17083640396595, + "grad_norm": 3.4220244884490967, + "kl": 2.31640625, + "learning_rate": 1.9655758268504785e-07, + "loss": 0.114, + "num_tokens": 1342214390.0, + "reward": 1.0361328125, + "reward_std": 0.3053995668888092, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.9326171875, + "rewards/tag_count_reward/std": 0.17374257743358612, "step": 2370 }, { @@ -68745,27 +68745,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 822.357421875, - "completions/mean_terminated_length": 785.3661499023438, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1046.935546875, + "completions/mean_terminated_length": 997.7028198242188, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, "epoch": 0.8094222070495861, - "grad_norm": 2.1128485202789307, - "kl": 5.21875, - "learning_rate": 1.9615563148852092e-07, - "loss": 0.3068, - "num_tokens": 1275800407.0, - "reward": 1.89599609375, - "reward_std": 0.4957549571990967, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.17917592823505402, + "grad_norm": 1.891930103302002, + "kl": 2.294921875, + "learning_rate": 1.9622588197026268e-07, + "loss": 0.1328, + "num_tokens": 1342843045.0, + "reward": 1.10595703125, + "reward_std": 0.3962911367416382, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.91064453125, + "rewards/tag_count_reward/std": 0.2019934356212616, "step": 2371 }, { @@ -68774,27 +68774,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1932.0, - "completions/mean_length": 853.7734375, - "completions/mean_terminated_length": 802.6965942382812, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1111.962890625, + "completions/mean_terminated_length": 1053.703369140625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, "epoch": 0.8097635913629768, - "grad_norm": 2.7275819778442383, - "kl": 7.1796875, - "learning_rate": 1.9582466531508135e-07, - "loss": 0.4194, - "num_tokens": 1276318851.0, - "reward": 1.8271484375, - "reward_std": 0.48342227935791016, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.18266506493091583, + "grad_norm": 2.9643852710723877, + "kl": 2.24609375, + "learning_rate": 1.958946837541734e-07, + "loss": 0.1091, + "num_tokens": 1343493682.0, + "reward": 1.06201171875, + "reward_std": 0.3362460136413574, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.92724609375, + "rewards/tag_count_reward/std": 0.1927117109298706, "step": 2372 }, { @@ -68803,27 +68803,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 834.15625, - "completions/mean_terminated_length": 805.0240478515625, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1110.408203125, + "completions/mean_terminated_length": 1066.3087158203125, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, "epoch": 0.8101049756763676, - "grad_norm": 1.0214875936508179, - "kl": 5.5859375, - "learning_rate": 1.9549420182874956e-07, - "loss": 0.3217, - "num_tokens": 1276828035.0, - "reward": 1.89794921875, - "reward_std": 0.4587544798851013, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.94873046875, - "rewards/tag_count_reward/std": 0.17114026844501495, + "grad_norm": 6.519678115844727, + "kl": 2.53125, + "learning_rate": 1.955639885072121e-07, + "loss": 0.1239, + "num_tokens": 1344144307.0, + "reward": 1.06494140625, + "reward_std": 0.35815680027008057, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.076171875, + "rewards/format_reward/std": 0.26553234457969666, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.18770819902420044, "step": 2373 }, { @@ -68832,27 +68832,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 834.70703125, - "completions/mean_terminated_length": 782.814697265625, - "completions/min_length": 52.0, - "completions/min_terminated_length": 52.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1103.923828125, + "completions/mean_terminated_length": 1055.4599609375, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, "epoch": 0.8104463599897584, - "grad_norm": 1.7794749736785889, - "kl": 6.98828125, - "learning_rate": 1.9516424149855829e-07, - "loss": 0.4101, - "num_tokens": 1277335549.0, - "reward": 1.78564453125, - "reward_std": 0.525837242603302, - "rewards/accuracy_reward/mean": 0.0234375, - "rewards/accuracy_reward/std": 0.15143637359142303, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.2028808295726776, + "grad_norm": 2.912588119506836, + "kl": 2.49609375, + "learning_rate": 1.9523379669909646e-07, + "loss": 0.143, + "num_tokens": 1344789660.0, + "reward": 1.0478515625, + "reward_std": 0.328630656003952, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.044921875, + "rewards/format_reward/std": 0.20733514428138733, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.20055793225765228, "step": 2374 }, { @@ -68861,27 +68861,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 886.5390625, - "completions/mean_terminated_length": 846.6505737304688, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1112.587890625, + "completions/mean_terminated_length": 1037.5970458984375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, "epoch": 0.8107877443031493, - "grad_norm": 1.9440349340438843, - "kl": 6.9609375, - "learning_rate": 1.94834784792826e-07, - "loss": 0.4378, - "num_tokens": 1277865841.0, - "reward": 1.88330078125, - "reward_std": 0.5041627883911133, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.20053589344024658, + "grad_norm": 2.573439121246338, + "kl": 2.619140625, + "learning_rate": 1.9490410879882897e-07, + "loss": 0.1592, + "num_tokens": 1345435689.0, + "reward": 1.107421875, + "reward_std": 0.37708961963653564, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.19415970146656036, "step": 2375 }, { @@ -68890,27 +68890,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 861.6484375, - "completions/mean_terminated_length": 803.30322265625, - "completions/min_length": 35.0, - "completions/min_terminated_length": 35.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1092.408203125, + "completions/mean_terminated_length": 1022.2913818359375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, "epoch": 0.8111291286165401, - "grad_norm": 1.0282351970672607, - "kl": 8.3515625, - "learning_rate": 1.9450583217915595e-07, - "loss": 0.5511, - "num_tokens": 1278386477.0, - "reward": 1.81591796875, - "reward_std": 0.5656579732894897, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.91162109375, - "rewards/tag_count_reward/std": 0.21643958985805511, + "grad_norm": 2.9239754676818848, + "kl": 2.8671875, + "learning_rate": 1.9457492527469628e-07, + "loss": 0.1804, + "num_tokens": 1346074474.0, + "reward": 1.0517578125, + "reward_std": 0.3837081491947174, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.8955078125, + "rewards/tag_count_reward/std": 0.22134028375148773, "step": 2376 }, { @@ -68919,27 +68919,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 825.478515625, - "completions/mean_terminated_length": 767.9774780273438, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1009.91796875, + "completions/mean_terminated_length": 969.9107055664062, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, "epoch": 0.8114705129299309, - "grad_norm": 1.0348620414733887, - "kl": 6.08984375, - "learning_rate": 1.9417738412443647e-07, - "loss": 0.3737, - "num_tokens": 1278889826.0, - "reward": 1.89501953125, - "reward_std": 0.5118416547775269, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.19511640071868896, + "grad_norm": 2.1076366901397705, + "kl": 2.294921875, + "learning_rate": 1.9424624659426897e-07, + "loss": 0.1111, + "num_tokens": 1346672256.0, + "reward": 1.091796875, + "reward_std": 0.3356272578239441, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.18078525364398956, "step": 2377 }, { @@ -68948,27 +68948,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 824.9453125, - "completions/mean_terminated_length": 790.5621948242188, - "completions/min_length": 186.0, - "completions/min_terminated_length": 186.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1108.390625, + "completions/mean_terminated_length": 1054.032958984375, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, "epoch": 0.8118118972433217, - "grad_norm": 1.0888521671295166, - "kl": 5.9453125, - "learning_rate": 1.938494410948394e-07, - "loss": 0.3837, - "num_tokens": 1279386534.0, - "reward": 1.8447265625, - "reward_std": 0.4803759455680847, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.9345703125, - "rewards/tag_count_reward/std": 0.18992790579795837, + "grad_norm": 2.6519134044647217, + "kl": 3.03125, + "learning_rate": 1.9391807322440007e-07, + "loss": 0.1496, + "num_tokens": 1347314088.0, + "reward": 1.0546875, + "reward_std": 0.4044151306152344, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.1971776783466339, "step": 2378 }, { @@ -68977,27 +68977,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 778.08203125, - "completions/mean_terminated_length": 747.6040649414062, - "completions/min_length": 169.0, - "completions/min_terminated_length": 169.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1048.755859375, + "completions/mean_terminated_length": 984.3555297851562, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, "epoch": 0.8121532815567125, - "grad_norm": 1.3943076133728027, - "kl": 5.34375, - "learning_rate": 1.9352200355581988e-07, - "loss": 0.3171, - "num_tokens": 1279861152.0, - "reward": 1.9248046875, - "reward_std": 0.4652273654937744, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.1723288893699646, + "grad_norm": 2.5883829593658447, + "kl": 3.203125, + "learning_rate": 1.9359040563122522e-07, + "loss": 0.184, + "num_tokens": 1347927291.0, + "reward": 1.0849609375, + "reward_std": 0.38237401843070984, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.9169921875, + "rewards/tag_count_reward/std": 0.19707830250263214, "step": 2379 }, { @@ -69006,27 +69006,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 828.548828125, - "completions/mean_terminated_length": 791.7444458007812, - "completions/min_length": 240.0, - "completions/min_terminated_length": 240.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1114.244140625, + "completions/mean_terminated_length": 1056.1265869140625, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, "epoch": 0.8124946658701032, - "grad_norm": 1.807580828666687, - "kl": 6.75, - "learning_rate": 1.9319507197211538e-07, - "loss": 0.445, - "num_tokens": 1280359017.0, - "reward": 1.859375, - "reward_std": 0.5383387804031372, - "rewards/accuracy_reward/mean": 0.08669354766607285, - "rewards/accuracy_reward/std": 0.281669557094574, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.2009030431509018, + "grad_norm": 2.717782735824585, + "kl": 2.8203125, + "learning_rate": 1.932632442801616e-07, + "loss": 0.1397, + "num_tokens": 1348571432.0, + "reward": 1.10302734375, + "reward_std": 0.3820044696331024, + "rewards/accuracy_reward/mean": 0.13709677755832672, + "rewards/accuracy_reward/std": 0.34429675340652466, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.91357421875, + "rewards/tag_count_reward/std": 0.1996237188577652, "step": 2380 }, { @@ -69035,27 +69035,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1957.0, - "completions/mean_length": 798.341796875, - "completions/mean_terminated_length": 765.7855834960938, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1017.66796875, + "completions/mean_terminated_length": 977.9594116210938, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, "epoch": 0.812836050183494, - "grad_norm": 2.365671396255493, - "kl": 5.4453125, - "learning_rate": 1.9286864680774578e-07, - "loss": 0.3664, - "num_tokens": 1280844168.0, - "reward": 1.876953125, - "reward_std": 0.4589555859565735, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.896484375, - "rewards/format_reward/std": 0.30492907762527466, - "rewards/tag_count_reward/mean": 0.9375, - "rewards/tag_count_reward/std": 0.17832663655281067, + "grad_norm": 2.6087486743927, + "kl": 2.419921875, + "learning_rate": 1.929365896359074e-07, + "loss": 0.1169, + "num_tokens": 1349168878.0, + "reward": 1.1083984375, + "reward_std": 0.39976656436920166, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.083984375, + "rewards/format_reward/std": 0.2776356339454651, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.18657785654067993, "step": 2381 }, { @@ -69064,27 +69064,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1962.0, - "completions/mean_length": 853.376953125, - "completions/mean_terminated_length": 822.2545166015625, - "completions/min_length": 49.0, - "completions/min_terminated_length": 49.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1147.646484375, + "completions/mean_terminated_length": 1097.5238037109375, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, "epoch": 0.8131774344968848, - "grad_norm": 1.2380436658859253, - "kl": 5.953125, - "learning_rate": 1.9254272852601193e-07, - "loss": 0.4262, - "num_tokens": 1281364601.0, - "reward": 1.86865234375, - "reward_std": 0.4679795801639557, - "rewards/accuracy_reward/mean": 0.04435483738780022, - "rewards/accuracy_reward/std": 0.2060900777578354, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.94091796875, - "rewards/tag_count_reward/std": 0.17914927005767822, + "grad_norm": 5.247420310974121, + "kl": 2.287109375, + "learning_rate": 1.9261044216244077e-07, + "loss": 0.134, + "num_tokens": 1349839977.0, + "reward": 1.0546875, + "reward_std": 0.35196352005004883, + "rewards/accuracy_reward/mean": 0.09072580933570862, + "rewards/accuracy_reward/std": 0.2875087857246399, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.19246920943260193, "step": 2382 }, { @@ -69093,27 +69093,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 691.244140625, - "completions/mean_terminated_length": 672.4376220703125, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1001.693359375, + "completions/mean_terminated_length": 952.4805297851562, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, "epoch": 0.8135188188102757, - "grad_norm": 1.8697482347488403, - "kl": 4.03515625, - "learning_rate": 1.9221731758949498e-07, - "loss": 0.3017, - "num_tokens": 1281791078.0, - "reward": 2.037109375, - "reward_std": 0.45592376589775085, - "rewards/accuracy_reward/mean": 0.150390625, - "rewards/accuracy_reward/std": 0.35780346393585205, - "rewards/format_reward/mean": 0.92578125, - "rewards/format_reward/std": 0.2623828947544098, - "rewards/tag_count_reward/mean": 0.9609375, - "rewards/tag_count_reward/std": 0.1465102881193161, + "grad_norm": 2.1906330585479736, + "kl": 2.755859375, + "learning_rate": 1.9228480232301977e-07, + "loss": 0.1319, + "num_tokens": 1350425404.0, + "reward": 1.2001953125, + "reward_std": 0.3937082290649414, + "rewards/accuracy_reward/mean": 0.22265625, + "rewards/accuracy_reward/std": 0.41643625497817993, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.9189453125, + "rewards/tag_count_reward/std": 0.19477635622024536, "step": 2383 }, { @@ -69122,27 +69122,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1913.0, - "completions/mean_length": 785.208984375, - "completions/mean_terminated_length": 747.0965576171875, - "completions/min_length": 106.0, - "completions/min_terminated_length": 106.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1049.00390625, + "completions/mean_terminated_length": 980.1795654296875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.8138602031236665, - "grad_norm": 0.859073281288147, - "kl": 6.3984375, - "learning_rate": 1.918924144600566e-07, - "loss": 0.4384, - "num_tokens": 1282270705.0, - "reward": 1.91259765625, - "reward_std": 0.47794193029403687, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.892578125, - "rewards/format_reward/std": 0.30995169281959534, - "rewards/tag_count_reward/mean": 0.94384765625, - "rewards/tag_count_reward/std": 0.1659528613090515, + "grad_norm": 4.723444938659668, + "kl": 2.091796875, + "learning_rate": 1.9195967058018125e-07, + "loss": 0.1269, + "num_tokens": 1351040094.0, + "reward": 1.064453125, + "reward_std": 0.3590174913406372, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.18962833285331726, "step": 2384 }, { @@ -69151,27 +69151,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 847.498046875, - "completions/mean_terminated_length": 798.6971435546875, - "completions/min_length": 88.0, - "completions/min_terminated_length": 88.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1120.302734375, + "completions/mean_terminated_length": 1080.6253662109375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.8142015874370573, - "grad_norm": 2.2664906978607178, - "kl": 5.44921875, - "learning_rate": 1.915680195988369e-07, - "loss": 0.3745, - "num_tokens": 1282790496.0, - "reward": 1.89892578125, - "reward_std": 0.4867627024650574, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.17533229291439056, + "grad_norm": 2.464592933654785, + "kl": 2.158203125, + "learning_rate": 1.9163504739574054e-07, + "loss": 0.1247, + "num_tokens": 1351699561.0, + "reward": 1.109375, + "reward_std": 0.3473352789878845, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.179597407579422, "step": 2385 }, { @@ -69180,27 +69180,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1966.0, - "completions/mean_length": 824.65234375, - "completions/mean_terminated_length": 800.2828979492188, - "completions/min_length": 19.0, - "completions/min_terminated_length": 19.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1137.53515625, + "completions/mean_terminated_length": 1096.6571044921875, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, "epoch": 0.8145429717504481, - "grad_norm": 1.8327305316925049, - "kl": 4.3828125, - "learning_rate": 1.912441334662554e-07, - "loss": 0.296, - "num_tokens": 1283294270.0, - "reward": 1.8974609375, - "reward_std": 0.38229382038116455, + "grad_norm": 1.6607451438903809, + "kl": 2.5888671875, + "learning_rate": 1.9131093323079044e-07, + "loss": 0.1336, + "num_tokens": 1352363531.0, + "reward": 1.0068359375, + "reward_std": 0.34897375106811523, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.9140625, - "rewards/format_reward/std": 0.28054583072662354, - "rewards/tag_count_reward/mean": 0.9521484375, - "rewards/tag_count_reward/std": 0.15453127026557922, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.9072265625, + "rewards/tag_count_reward/std": 0.20437131822109222, "step": 2386 }, { @@ -69209,27 +69209,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1902.0, - "completions/mean_length": 763.08984375, - "completions/mean_terminated_length": 726.9678344726562, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1062.66015625, + "completions/mean_terminated_length": 1026.757080078125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, "epoch": 0.8148843560638389, - "grad_norm": 1.331518292427063, - "kl": 5.4609375, - "learning_rate": 1.9092075652200894e-07, - "loss": 0.3553, - "num_tokens": 1283764812.0, - "reward": 1.92724609375, - "reward_std": 0.44393885135650635, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.95068359375, - "rewards/tag_count_reward/std": 0.15914559364318848, + "grad_norm": 1.9651501178741455, + "kl": 1.74609375, + "learning_rate": 1.9098732854570104e-07, + "loss": 0.0814, + "num_tokens": 1352987453.0, + "reward": 1.123046875, + "reward_std": 0.35017603635787964, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.076171875, + "rewards/format_reward/std": 0.26553234457969666, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.17942707240581512, "step": 2387 }, { @@ -69240,25 +69240,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 739.494140625, - "completions/mean_terminated_length": 694.5556030273438, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1036.416015625, + "completions/mean_terminated_length": 1001.6748046875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, "epoch": 0.8152257403772296, - "grad_norm": 1.7126271724700928, - "kl": 7.109375, - "learning_rate": 1.9059788922507213e-07, - "loss": 0.466, - "num_tokens": 1284229129.0, - "reward": 1.88134765625, - "reward_std": 0.47255924344062805, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.896484375, - "rewards/format_reward/std": 0.30492907762527466, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.1794690042734146, + "grad_norm": 1.5949029922485352, + "kl": 2.21484375, + "learning_rate": 1.906642338001182e-07, + "loss": 0.1206, + "num_tokens": 1353603794.0, + "reward": 1.111328125, + "reward_std": 0.404132604598999, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.080078125, + "rewards/format_reward/std": 0.271679550409317, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.17685236036777496, "step": 2388 }, { @@ -69267,27 +69267,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 827.318359375, - "completions/mean_terminated_length": 795.5170288085938, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1140.08203125, + "completions/mean_terminated_length": 1077.5323486328125, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, "epoch": 0.8155671246906204, - "grad_norm": 3.0533907413482666, - "kl": 6.9296875, - "learning_rate": 1.902755320336961e-07, - "loss": 0.4197, - "num_tokens": 1284730172.0, - "reward": 1.88134765625, - "reward_std": 0.45382267236709595, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.18082687258720398, + "grad_norm": 5.9671173095703125, + "kl": 2.89453125, + "learning_rate": 1.9034164945296415e-07, + "loss": 0.1267, + "num_tokens": 1354264972.0, + "reward": 1.078125, + "reward_std": 0.4102986454963684, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.08203125, + "rewards/format_reward/std": 0.2746807038784027, + "rewards/tag_count_reward/mean": 0.904296875, + "rewards/tag_count_reward/std": 0.20421463251113892, "step": 2389 }, { @@ -69296,27 +69296,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 828.9609375, - "completions/mean_terminated_length": 789.6370849609375, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1179.158203125, + "completions/mean_terminated_length": 1103.526611328125, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, "epoch": 0.8159085090040112, - "grad_norm": 3.313844680786133, - "kl": 8.3515625, - "learning_rate": 1.899536854054079e-07, - "loss": 0.5099, - "num_tokens": 1285232248.0, - "reward": 1.8818359375, - "reward_std": 0.4815102517604828, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.9345703125, - "rewards/tag_count_reward/std": 0.19121153652668, + "grad_norm": 3.905566453933716, + "kl": 2.53515625, + "learning_rate": 1.9001957596243557e-07, + "loss": 0.1548, + "num_tokens": 1354946349.0, + "reward": 1.08154296875, + "reward_std": 0.398532509803772, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.08984375, + "rewards/format_reward/std": 0.2862374484539032, + "rewards/tag_count_reward/mean": 0.90380859375, + "rewards/tag_count_reward/std": 0.20725619792938232, "step": 2390 }, { @@ -69325,27 +69325,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 807.07421875, - "completions/mean_terminated_length": 767.0443115234375, - "completions/min_length": 57.0, - "completions/min_terminated_length": 57.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 1093.697265625, + "completions/mean_terminated_length": 1042.64404296875, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, "epoch": 0.816249893317402, - "grad_norm": 3.641836404800415, - "kl": 8.265625, - "learning_rate": 1.8963234979700986e-07, - "loss": 0.4863, - "num_tokens": 1285725182.0, - "reward": 1.8818359375, - "reward_std": 0.5220977663993835, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.2033400982618332, + "grad_norm": 1.9376497268676758, + "kl": 2.126953125, + "learning_rate": 1.896980137860038e-07, + "loss": 0.0922, + "num_tokens": 1355586034.0, + "reward": 1.11962890625, + "reward_std": 0.38722673058509827, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.18640044331550598, "step": 2391 }, { @@ -69354,27 +69354,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1901.0, - "completions/mean_length": 738.0390625, - "completions/mean_terminated_length": 698.5029907226562, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1007.431640625, + "completions/mean_terminated_length": 951.7633666992188, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, "epoch": 0.8165912776307929, - "grad_norm": 1.327378273010254, - "kl": 7.3359375, - "learning_rate": 1.8931152566457903e-07, - "loss": 0.4818, - "num_tokens": 1286178706.0, - "reward": 1.951171875, - "reward_std": 0.5315266847610474, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.1712302714586258, + "grad_norm": 1.6739675998687744, + "kl": 2.708984375, + "learning_rate": 1.8937696338041397e-07, + "loss": 0.1664, + "num_tokens": 1356177487.0, + "reward": 1.17041015625, + "reward_std": 0.3682592809200287, + "rewards/accuracy_reward/mean": 0.19140625, + "rewards/accuracy_reward/std": 0.3937928080558777, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.19093835353851318, "step": 2392 }, { @@ -69383,27 +69383,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 775.56640625, - "completions/mean_terminated_length": 731.86669921875, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1069.630859375, + "completions/mean_terminated_length": 1017.2901000976562, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, "epoch": 0.8169326619441837, - "grad_norm": 2.8124125003814697, - "kl": 7.0390625, - "learning_rate": 1.8899121346346682e-07, - "loss": 0.4314, - "num_tokens": 1286650340.0, - "reward": 1.88525390625, - "reward_std": 0.4600207805633545, - "rewards/accuracy_reward/mean": 0.058467742055654526, - "rewards/accuracy_reward/std": 0.23486268520355225, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.93798828125, - "rewards/tag_count_reward/std": 0.18222707509994507, + "grad_norm": 3.429978847503662, + "kl": 3.609375, + "learning_rate": 1.890564252016843e-07, + "loss": 0.2005, + "num_tokens": 1356799682.0, + "reward": 1.091796875, + "reward_std": 0.3947150707244873, + "rewards/accuracy_reward/mean": 0.11088709533214569, + "rewards/accuracy_reward/std": 0.3143092691898346, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.20075078308582306, "step": 2393 }, { @@ -69414,25 +69414,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1967.0, - "completions/mean_length": 825.013671875, - "completions/mean_terminated_length": 785.5625, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1103.44921875, + "completions/mean_terminated_length": 1072.9798583984375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, "epoch": 0.8172740462575745, - "grad_norm": 1.3582987785339355, - "kl": 6.0390625, - "learning_rate": 1.8867141364829758e-07, - "loss": 0.3957, - "num_tokens": 1287145483.0, - "reward": 1.9453125, - "reward_std": 0.43716537952423096, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.912109375, - "rewards/format_reward/std": 0.2834126651287079, - "rewards/tag_count_reward/mean": 0.951171875, - "rewards/tag_count_reward/std": 0.16495446860790253, + "grad_norm": 2.6521358489990234, + "kl": 1.74609375, + "learning_rate": 1.887363997051051e-07, + "loss": 0.042, + "num_tokens": 1357437384.0, + "reward": 1.109375, + "reward_std": 0.3834799826145172, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.072265625, + "rewards/format_reward/std": 0.2591804563999176, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.17737028002738953, "step": 2394 }, { @@ -69441,27 +69441,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 801.60546875, - "completions/mean_terminated_length": 753.5699462890625, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1051.404296875, + "completions/mean_terminated_length": 1017.1777954101562, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.8176154305709653, - "grad_norm": 1.6503441333770752, - "kl": 6.921875, - "learning_rate": 1.8835212667296873e-07, - "loss": 0.434, - "num_tokens": 1287640433.0, - "reward": 1.86767578125, - "reward_std": 0.42626187205314636, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.18751464784145355, + "grad_norm": 2.0698275566101074, + "kl": 1.990234375, + "learning_rate": 1.8841688734523898e-07, + "loss": 0.0895, + "num_tokens": 1358060231.0, + "reward": 1.0849609375, + "reward_std": 0.34596845507621765, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.1784525215625763, "step": 2395 }, { @@ -69470,27 +69470,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1977.0, - "completions/mean_length": 781.658203125, - "completions/mean_terminated_length": 743.4385986328125, - "completions/min_length": 96.0, - "completions/min_terminated_length": 96.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1077.01171875, + "completions/mean_terminated_length": 1039.5902099609375, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, "epoch": 0.817956814884356, - "grad_norm": 0.7858288288116455, - "kl": 6.015625, - "learning_rate": 1.8803335299064998e-07, - "loss": 0.395, - "num_tokens": 1288118450.0, - "reward": 1.8798828125, - "reward_std": 0.48168665170669556, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.9345703125, - "rewards/tag_count_reward/std": 0.18536527454853058, + "grad_norm": 2.6365702152252197, + "kl": 2.4140625, + "learning_rate": 1.8809788857591918e-07, + "loss": 0.1363, + "num_tokens": 1358689469.0, + "reward": 1.11181640625, + "reward_std": 0.3518145680427551, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.16533562541007996, "step": 2396 }, { @@ -69499,27 +69499,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.095703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1963.0, - "completions/mean_length": 895.197265625, - "completions/mean_terminated_length": 836.0184936523438, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1195.251953125, + "completions/mean_terminated_length": 1105.0042724609375, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, "epoch": 0.8182981991977468, - "grad_norm": 1.5382969379425049, - "kl": 7.4140625, - "learning_rate": 1.8771509305378186e-07, - "loss": 0.4673, - "num_tokens": 1288652135.0, - "reward": 1.7978515625, - "reward_std": 0.5393427014350891, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.20277541875839233, + "grad_norm": 1.8713483810424805, + "kl": 2.53515625, + "learning_rate": 1.8777940385024983e-07, + "loss": 0.1401, + "num_tokens": 1359376782.0, + "reward": 1.0751953125, + "reward_std": 0.3872072100639343, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.076171875, + "rewards/format_reward/std": 0.26553234457969666, + "rewards/tag_count_reward/mean": 0.9033203125, + "rewards/tag_count_reward/std": 0.20614035427570343, "step": 2397 }, { @@ -69528,27 +69528,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1920.0, - "completions/mean_length": 825.15234375, - "completions/mean_terminated_length": 772.8513793945312, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1071.82421875, + "completions/mean_terminated_length": 1002.3890991210938, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, "epoch": 0.8186395835111376, - "grad_norm": 1.5090433359146118, - "kl": 4.8515625, - "learning_rate": 1.8739734731407646e-07, - "loss": 0.3283, - "num_tokens": 1289155061.0, - "reward": 1.888671875, - "reward_std": 0.46926945447921753, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.18702034652233124, + "grad_norm": 1.6924453973770142, + "kl": 2.40625, + "learning_rate": 1.8746143362060473e-07, + "loss": 0.1272, + "num_tokens": 1360006004.0, + "reward": 1.08203125, + "reward_std": 0.3599720597267151, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.1937359869480133, "step": 2398 }, { @@ -69557,27 +69557,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1988.0, - "completions/mean_length": 781.5, - "completions/mean_terminated_length": 743.275634765625, - "completions/min_length": 63.0, - "completions/min_terminated_length": 63.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1064.685546875, + "completions/mean_terminated_length": 1018.435546875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, "epoch": 0.8189809678245284, - "grad_norm": 1.3026745319366455, - "kl": 5.23046875, - "learning_rate": 1.870801162225155e-07, - "loss": 0.3318, - "num_tokens": 1289630501.0, - "reward": 1.923828125, - "reward_std": 0.4812939167022705, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.9453125, - "rewards/tag_count_reward/std": 0.17468871176242828, + "grad_norm": 1.4412189722061157, + "kl": 2.37109375, + "learning_rate": 1.8714397833862717e-07, + "loss": 0.1362, + "num_tokens": 1360626435.0, + "reward": 1.08203125, + "reward_std": 0.3512256443500519, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.919921875, + "rewards/tag_count_reward/std": 0.18881024420261383, "step": 2399 }, { @@ -69586,27 +69586,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 808.80078125, - "completions/mean_terminated_length": 758.4268188476562, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1084.49609375, + "completions/mean_terminated_length": 1022.399169921875, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, "epoch": 0.8193223521379193, - "grad_norm": 1.4187021255493164, - "kl": 6.4609375, - "learning_rate": 1.8676340022935073e-07, - "loss": 0.4109, - "num_tokens": 1290125391.0, - "reward": 1.85498046875, - "reward_std": 0.4715687036514282, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93505859375, - "rewards/tag_count_reward/std": 0.18454577028751373, + "grad_norm": 1.838685393333435, + "kl": 3.03125, + "learning_rate": 1.8682703845522837e-07, + "loss": 0.178, + "num_tokens": 1361262481.0, + "reward": 1.0205078125, + "reward_std": 0.3525121212005615, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.9052734375, + "rewards/tag_count_reward/std": 0.20939649641513824, "step": 2400 }, { @@ -69615,27 +69615,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1971.0, - "completions/mean_length": 882.208984375, - "completions/mean_terminated_length": 817.309326171875, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1141.29296875, + "completions/mean_terminated_length": 1078.8267822265625, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, "epoch": 0.8196637364513101, - "grad_norm": 1.629425048828125, - "kl": 6.328125, - "learning_rate": 1.8644719978410227e-07, - "loss": 0.4187, - "num_tokens": 1290648746.0, - "reward": 1.81005859375, - "reward_std": 0.46727508306503296, - "rewards/accuracy_reward/mean": 0.015625, - "rewards/accuracy_reward/std": 0.12414088100194931, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.198433056473732, + "grad_norm": 2.0104587078094482, + "kl": 2.107421875, + "learning_rate": 1.865106144205883e-07, + "loss": 0.0971, + "num_tokens": 1361918487.0, + "reward": 1.01806640625, + "reward_std": 0.34818506240844727, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.2029850035905838, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.91259765625, + "rewards/tag_count_reward/std": 0.19858227670192719, "step": 2401 }, { @@ -69644,27 +69644,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 880.744140625, - "completions/mean_terminated_length": 805.515625, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1093.822265625, + "completions/mean_terminated_length": 1034.4337158203125, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, "epoch": 0.8200051207647009, - "grad_norm": 2.2330965995788574, - "kl": 7.90625, - "learning_rate": 1.861315153355592e-07, - "loss": 0.4897, - "num_tokens": 1291173639.0, - "reward": 1.82275390625, - "reward_std": 0.5710577964782715, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.2157054841518402, + "grad_norm": 4.024048328399658, + "kl": 1.837890625, + "learning_rate": 1.8619470668415351e-07, + "loss": 0.1052, + "num_tokens": 1362552476.0, + "reward": 1.12939453125, + "reward_std": 0.3992331922054291, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.076171875, + "rewards/format_reward/std": 0.26553234457969666, + "rewards/tag_count_reward/mean": 0.92041015625, + "rewards/tag_count_reward/std": 0.1867384910583496, "step": 2402 }, { @@ -69673,27 +69673,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 806.13671875, - "completions/mean_terminated_length": 776.33203125, - "completions/min_length": 68.0, - "completions/min_terminated_length": 68.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1024.59375, + "completions/mean_terminated_length": 982.9918212890625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, "epoch": 0.8203465050780917, - "grad_norm": 1.571038007736206, - "kl": 5.5390625, - "learning_rate": 1.8581634733177758e-07, - "loss": 0.3696, - "num_tokens": 1291659277.0, - "reward": 1.8994140625, - "reward_std": 0.5044021606445312, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.9443359375, - "rewards/tag_count_reward/std": 0.17083640396595, + "grad_norm": 2.069023847579956, + "kl": 2.078125, + "learning_rate": 1.858793156946376e-07, + "loss": 0.1215, + "num_tokens": 1363149964.0, + "reward": 1.11328125, + "reward_std": 0.3569978177547455, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.16907380521297455, "step": 2403 }, { @@ -69702,27 +69702,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.06640625, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1954.0, - "completions/mean_length": 880.21875, - "completions/mean_terminated_length": 797.15478515625, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1073.671875, + "completions/mean_terminated_length": 1038.1700439453125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.8206878893914825, - "grad_norm": 1.3357652425765991, - "kl": 7.984375, - "learning_rate": 1.8550169622008078e-07, - "loss": 0.5268, - "num_tokens": 1292183805.0, - "reward": 1.81005859375, - "reward_std": 0.5667085647583008, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.91552734375, - "rewards/tag_count_reward/std": 0.21346116065979004, + "grad_norm": 4.195062637329102, + "kl": 2.16796875, + "learning_rate": 1.855644419000202e-07, + "loss": 0.1249, + "num_tokens": 1363773540.0, + "reward": 1.0615234375, + "reward_std": 0.344260573387146, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.19436383247375488, "step": 2404 }, { @@ -69731,27 +69731,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 781.201171875, - "completions/mean_terminated_length": 740.336669921875, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 987.3515625, + "completions/mean_terminated_length": 941.9878540039062, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, "epoch": 0.8210292737048732, - "grad_norm": 1.2438210248947144, - "kl": 5.265625, - "learning_rate": 1.851875624470586e-07, - "loss": 0.3265, - "num_tokens": 1292659876.0, - "reward": 1.8740234375, - "reward_std": 0.4153513014316559, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.9443359375, - "rewards/tag_count_reward/std": 0.1693984717130661, + "grad_norm": 2.212071180343628, + "kl": 1.78515625, + "learning_rate": 1.85250085747546e-07, + "loss": 0.0979, + "num_tokens": 1364355160.0, + "reward": 1.08251953125, + "reward_std": 0.380862832069397, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.17738577723503113, "step": 2405 }, { @@ -69760,27 +69760,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 890.201171875, - "completions/mean_terminated_length": 825.7463989257812, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1059.4765625, + "completions/mean_terminated_length": 1017.1976318359375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, "epoch": 0.821370658018264, - "grad_norm": 2.0447282791137695, - "kl": 7.4296875, - "learning_rate": 1.8487394645856636e-07, - "loss": 0.4506, - "num_tokens": 1293197355.0, - "reward": 1.82275390625, - "reward_std": 0.587771475315094, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.822265625, - "rewards/format_reward/std": 0.3826628625392914, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.21203739941120148, + "grad_norm": 3.7569053173065186, + "kl": 1.587890625, + "learning_rate": 1.84936247683725e-07, + "loss": 0.0758, + "num_tokens": 1364979308.0, + "reward": 1.16650390625, + "reward_std": 0.39158207178115845, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.16754020750522614, "step": 2406 }, { @@ -69789,27 +69789,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1880.0, - "completions/mean_length": 772.44921875, - "completions/mean_terminated_length": 744.443115234375, - "completions/min_length": 101.0, - "completions/min_terminated_length": 101.0, + "completions/max_terminated_length": 1971.0, + "completions/mean_length": 1001.0390625, + "completions/mean_terminated_length": 973.7635498046875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, "epoch": 0.8217120423316548, - "grad_norm": 1.5338774919509888, - "kl": 4.79296875, - "learning_rate": 1.8456084869972472e-07, - "loss": 0.3286, - "num_tokens": 1293666113.0, - "reward": 1.916015625, - "reward_std": 0.4454042315483093, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.900390625, - "rewards/format_reward/std": 0.29977133870124817, - "rewards/tag_count_reward/mean": 0.947265625, - "rewards/tag_count_reward/std": 0.16224436461925507, + "grad_norm": 3.1795504093170166, + "kl": 2.609375, + "learning_rate": 1.8462292815433057e-07, + "loss": 0.1654, + "num_tokens": 1365565104.0, + "reward": 1.1298828125, + "reward_std": 0.37695837020874023, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.17378656566143036, "step": 2407 }, { @@ -69818,27 +69818,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1954.0, - "completions/mean_length": 818.033203125, - "completions/mean_terminated_length": 778.3568115234375, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1014.421875, + "completions/mean_terminated_length": 963.590087890625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, "epoch": 0.8220534266450457, - "grad_norm": 1.519821286201477, - "kl": 5.4453125, - "learning_rate": 1.8424826961491852e-07, - "loss": 0.3472, - "num_tokens": 1294152194.0, - "reward": 1.8525390625, - "reward_std": 0.4574953317642212, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.18472495675086975, + "grad_norm": 2.2671878337860107, + "kl": 2.76953125, + "learning_rate": 1.8431012760440028e-07, + "loss": 0.1432, + "num_tokens": 1366151736.0, + "reward": 1.0576171875, + "reward_std": 0.30838334560394287, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.1939898282289505, "step": 2408 }, { @@ -69847,27 +69847,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1980.0, - "completions/mean_length": 853.595703125, - "completions/mean_terminated_length": 810.0748901367188, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1083.91015625, + "completions/mean_terminated_length": 1034.4189453125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, "epoch": 0.8223948109584365, - "grad_norm": 1.4335675239562988, - "kl": 7.453125, - "learning_rate": 1.8393620964779675e-07, - "loss": 0.4401, - "num_tokens": 1294665027.0, - "reward": 1.7783203125, - "reward_std": 0.5727628469467163, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.9052734375, - "rewards/tag_count_reward/std": 0.21798203885555267, + "grad_norm": 1.744873046875, + "kl": 1.783203125, + "learning_rate": 1.8399784647823388e-07, + "loss": 0.0551, + "num_tokens": 1366782490.0, + "reward": 1.11279296875, + "reward_std": 0.4091048538684845, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.087890625, + "rewards/format_reward/std": 0.2834126651287079, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.19979596138000488, "step": 2409 }, { @@ -69876,27 +69876,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1886.0, - "completions/mean_length": 797.01171875, - "completions/mean_terminated_length": 756.6572265625, - "completions/min_length": 71.0, - "completions/min_terminated_length": 71.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1026.7109375, + "completions/mean_terminated_length": 972.0740356445312, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.8227361952718273, - "grad_norm": 1.3071527481079102, - "kl": 6.2421875, - "learning_rate": 1.8362466924127145e-07, - "loss": 0.3913, - "num_tokens": 1295146569.0, - "reward": 1.87890625, - "reward_std": 0.5147675275802612, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.1850479692220688, + "grad_norm": 4.284357070922852, + "kl": 2.298828125, + "learning_rate": 1.8368608521939383e-07, + "loss": 0.0797, + "num_tokens": 1367381638.0, + "reward": 1.0947265625, + "reward_std": 0.39112332463264465, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.9189453125, + "rewards/tag_count_reward/std": 0.1883922666311264, "step": 2410 }, { @@ -69907,25 +69907,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 778.109375, - "completions/mean_terminated_length": 726.48779296875, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1020.359375, + "completions/mean_terminated_length": 978.5853271484375, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, "epoch": 0.8230775795852181, - "grad_norm": 2.4448471069335938, - "kl": 6.609375, - "learning_rate": 1.833136488375171e-07, - "loss": 0.4477, - "num_tokens": 1295620193.0, - "reward": 1.91357421875, - "reward_std": 0.5689276456832886, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.19280590116977692, + "grad_norm": 3.4293127059936523, + "kl": 2.9140625, + "learning_rate": 1.8337484427070406e-07, + "loss": 0.1557, + "num_tokens": 1367979294.0, + "reward": 1.14990234375, + "reward_std": 0.37268367409706116, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, + "rewards/format_reward/mean": 0.072265625, + "rewards/format_reward/std": 0.2591804563999176, + "rewards/tag_count_reward/mean": 0.91943359375, + "rewards/tag_count_reward/std": 0.1914980411529541, "step": 2411 }, { @@ -69934,27 +69934,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 819.748046875, - "completions/mean_terminated_length": 792.7804565429688, - "completions/min_length": 183.0, - "completions/min_terminated_length": 183.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1083.80859375, + "completions/mean_terminated_length": 1032.226318359375, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, "epoch": 0.8234189638986089, - "grad_norm": 1.5323444604873657, - "kl": 5.390625, - "learning_rate": 1.8300314887797048e-07, - "loss": 0.3641, - "num_tokens": 1296110048.0, - "reward": 1.951171875, - "reward_std": 0.5348630547523499, - "rewards/accuracy_reward/mean": 0.12109375, - "rewards/accuracy_reward/std": 0.3265552520751953, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.16762074828147888, + "grad_norm": 7.222492218017578, + "kl": 2.8828125, + "learning_rate": 1.8306412407424938e-07, + "loss": 0.1217, + "num_tokens": 1368604348.0, + "reward": 1.19921875, + "reward_std": 0.46285369992256165, + "rewards/accuracy_reward/mean": 0.193359375, + "rewards/accuracy_reward/std": 0.39531853795051575, + "rewards/format_reward/mean": 0.09765625, + "rewards/format_reward/std": 0.29713961482048035, + "rewards/tag_count_reward/mean": 0.908203125, + "rewards/tag_count_reward/std": 0.20836491882801056, "step": 2412 }, { @@ -69963,27 +69963,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 809.5546875, - "completions/mean_terminated_length": 782.36328125, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1065.822265625, + "completions/mean_terminated_length": 1019.625732421875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.8237603482119996, - "grad_norm": 1.4346827268600464, - "kl": 6.015625, - "learning_rate": 1.8269316980332926e-07, - "loss": 0.3519, - "num_tokens": 1296607004.0, - "reward": 1.87939453125, - "reward_std": 0.5118072032928467, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.18617989122867584, + "grad_norm": 4.418808460235596, + "kl": 2.62109375, + "learning_rate": 1.8275392507137482e-07, + "loss": 0.1407, + "num_tokens": 1369232513.0, + "reward": 1.111328125, + "reward_std": 0.4184204339981079, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.908203125, + "rewards/tag_count_reward/std": 0.20895110070705414, "step": 2413 }, { @@ -69992,27 +69992,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1959.0, - "completions/mean_length": 877.84375, - "completions/mean_terminated_length": 822.8057250976562, - "completions/min_length": 198.0, - "completions/min_terminated_length": 198.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1125.31640625, + "completions/mean_terminated_length": 1049.2388916015625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, "epoch": 0.8241017325253904, - "grad_norm": 2.058574914932251, - "kl": 8.0234375, - "learning_rate": 1.823837120535523e-07, - "loss": 0.4802, - "num_tokens": 1297139164.0, - "reward": 1.83935546875, - "reward_std": 0.5372190475463867, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.2066698521375656, + "grad_norm": 4.76790714263916, + "kl": 3.1484375, + "learning_rate": 1.8244424770268547e-07, + "loss": 0.1548, + "num_tokens": 1369891379.0, + "reward": 1.0927734375, + "reward_std": 0.39666956663131714, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.8974609375, + "rewards/tag_count_reward/std": 0.2260729968547821, "step": 2414 }, { @@ -70021,27 +70021,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 830.666015625, - "completions/mean_terminated_length": 788.858642578125, - "completions/min_length": 92.0, - "completions/min_terminated_length": 92.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1059.1640625, + "completions/mean_terminated_length": 1001.9586181640625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, "epoch": 0.8244431168387812, - "grad_norm": 1.3971267938613892, - "kl": 6.703125, - "learning_rate": 1.820747760678581e-07, - "loss": 0.4367, - "num_tokens": 1297641553.0, - "reward": 1.8408203125, - "reward_std": 0.516970157623291, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.9306640625, - "rewards/tag_count_reward/std": 0.18723225593566895, + "grad_norm": 5.8182783126831055, + "kl": 3.2734375, + "learning_rate": 1.821350924080449e-07, + "loss": 0.1576, + "num_tokens": 1370510759.0, + "reward": 1.06640625, + "reward_std": 0.3936970829963684, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.072265625, + "rewards/format_reward/std": 0.2591804563999176, + "rewards/tag_count_reward/mean": 0.904296875, + "rewards/tag_count_reward/std": 0.21300913393497467, "step": 2415 }, { @@ -70050,27 +70050,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1988.0, - "completions/mean_length": 806.08984375, - "completions/mean_terminated_length": 791.3636474609375, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1059.669921875, + "completions/mean_terminated_length": 1025.727294921875, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, "epoch": 0.8247845011521721, - "grad_norm": 2.4153144359588623, - "kl": 3.859375, - "learning_rate": 1.8176636228472476e-07, - "loss": 0.2265, - "num_tokens": 1298131327.0, - "reward": 1.9189453125, - "reward_std": 0.43295690417289734, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.9560546875, - "rewards/tag_count_reward/std": 0.14845077693462372, + "grad_norm": 1.6957921981811523, + "kl": 2.09375, + "learning_rate": 1.818264596265758e-07, + "loss": 0.0913, + "num_tokens": 1371130366.0, + "reward": 1.08740234375, + "reward_std": 0.3478482961654663, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.17503775656223297, "step": 2416 }, { @@ -70079,27 +70079,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1951.0, - "completions/mean_length": 811.267578125, - "completions/mean_terminated_length": 771.3729858398438, - "completions/min_length": 76.0, - "completions/min_terminated_length": 76.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1061.90625, + "completions/mean_terminated_length": 1011.2854614257812, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, "epoch": 0.8251258854655629, - "grad_norm": 1.9129817485809326, - "kl": 6.89453125, - "learning_rate": 1.814584711418894e-07, - "loss": 0.4248, - "num_tokens": 1298624424.0, - "reward": 1.830078125, - "reward_std": 0.5324984788894653, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.2025608867406845, + "grad_norm": 6.86099910736084, + "kl": 2.046875, + "learning_rate": 1.8151834979665825e-07, + "loss": 0.1166, + "num_tokens": 1371751790.0, + "reward": 1.07177734375, + "reward_std": 0.347286581993103, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.1843547374010086, + "rewards/tag_count_reward/mean": 0.91552734375, + "rewards/tag_count_reward/std": 0.19614213705062866, "step": 2417 }, { @@ -70108,27 +70108,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1955.0, - "completions/mean_length": 868.361328125, - "completions/mean_terminated_length": 802.6907348632812, - "completions/min_length": 50.0, - "completions/min_terminated_length": 50.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1127.796875, + "completions/mean_terminated_length": 1080.55859375, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, "epoch": 0.8254672697789537, - "grad_norm": 3.6364946365356445, - "kl": 7.828125, - "learning_rate": 1.8115110307634695e-07, - "loss": 0.4445, - "num_tokens": 1299145841.0, - "reward": 1.86328125, - "reward_std": 0.5388745665550232, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.199825257062912, + "grad_norm": 3.594465970993042, + "kl": 2.056640625, + "learning_rate": 1.8121076335592976e-07, + "loss": 0.1043, + "num_tokens": 1372406038.0, + "reward": 1.107421875, + "reward_std": 0.400793194770813, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.908203125, + "rewards/tag_count_reward/std": 0.2147248089313507, "step": 2418 }, { @@ -70137,27 +70137,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1950.0, - "completions/mean_length": 836.2890625, - "completions/mean_terminated_length": 807.2080688476562, - "completions/min_length": 183.0, - "completions/min_terminated_length": 183.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 1080.783203125, + "completions/mean_terminated_length": 1039.41552734375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.8258086540923445, - "grad_norm": 0.7928247451782227, - "kl": 5.90625, - "learning_rate": 1.8084425852435044e-07, - "loss": 0.3592, - "num_tokens": 1299650341.0, - "reward": 1.8720703125, - "reward_std": 0.49951568245887756, - "rewards/accuracy_reward/mean": 0.0729166641831398, - "rewards/accuracy_reward/std": 0.2602709233760834, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.17934982478618622, + "grad_norm": 1.3764461278915405, + "kl": 1.703125, + "learning_rate": 1.809037007412842e-07, + "loss": 0.0812, + "num_tokens": 1373035719.0, + "reward": 1.0986328125, + "reward_std": 0.3274257779121399, + "rewards/accuracy_reward/mean": 0.12291666865348816, + "rewards/accuracy_reward/std": 0.32868409156799316, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.9345703125, + "rewards/tag_count_reward/std": 0.17166221141815186, "step": 2419 }, { @@ -70166,27 +70166,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0078125, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 750.4609375, - "completions/mean_terminated_length": 740.2440795898438, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1038.4296875, + "completions/mean_terminated_length": 971.1250610351562, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.8261500384057353, - "grad_norm": 2.1252243518829346, - "kl": 5.3671875, - "learning_rate": 1.805379379214093e-07, - "loss": 0.3033, - "num_tokens": 1300113073.0, - "reward": 1.9208984375, - "reward_std": 0.4738973379135132, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.9462890625, - "rewards/tag_count_reward/std": 0.16342678666114807, + "grad_norm": 9.349227905273438, + "kl": 2.62890625, + "learning_rate": 1.8059716238887163e-07, + "loss": 0.1868, + "num_tokens": 1373645891.0, + "reward": 1.0654296875, + "reward_std": 0.3468262851238251, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.9072265625, + "rewards/tag_count_reward/std": 0.21315935254096985, "step": 2420 }, { @@ -70195,27 +70195,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 872.5234375, - "completions/mean_terminated_length": 814.7130737304688, - "completions/min_length": 12.0, - "completions/min_terminated_length": 12.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1091.439453125, + "completions/mean_terminated_length": 1050.527587890625, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, "epoch": 0.826491422719126, - "grad_norm": 3.4147696495056152, - "kl": 7.90625, - "learning_rate": 1.802321417022899e-07, - "loss": 0.4664, - "num_tokens": 1300635661.0, - "reward": 1.79150390625, - "reward_std": 0.5270118117332458, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.19631743431091309, + "grad_norm": 1.4730504751205444, + "kl": 1.677734375, + "learning_rate": 1.802911487340972e-07, + "loss": 0.0619, + "num_tokens": 1374280564.0, + "reward": 1.06591796875, + "reward_std": 0.35266730189323425, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.16933852434158325, "step": 2421 }, { @@ -70224,27 +70224,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 908.337890625, - "completions/mean_terminated_length": 869.197998046875, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1225.04296875, + "completions/mean_terminated_length": 1139.909423828125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, "epoch": 0.8268328070325168, - "grad_norm": 0.9701458215713501, - "kl": 6.109375, - "learning_rate": 1.7992687030101388e-07, - "loss": 0.3619, - "num_tokens": 1301183578.0, - "reward": 1.8291015625, - "reward_std": 0.5332648754119873, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.1988353431224823, + "grad_norm": 1.4224796295166016, + "kl": 2.52734375, + "learning_rate": 1.7998566021162088e-07, + "loss": 0.1248, + "num_tokens": 1374990634.0, + "reward": 1.0439453125, + "reward_std": 0.38714104890823364, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.8916015625, + "rewards/tag_count_reward/std": 0.21664533019065857, "step": 2422 }, { @@ -70253,27 +70253,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1910.0, - "completions/mean_length": 765.150390625, - "completions/mean_terminated_length": 749.9387817382812, - "completions/min_length": 114.0, - "completions/min_terminated_length": 114.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1036.66015625, + "completions/mean_terminated_length": 999.8097534179688, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, "epoch": 0.8271741913459076, - "grad_norm": 1.2732343673706055, - "kl": 4.94140625, - "learning_rate": 1.7962212415085804e-07, - "loss": 0.327, - "num_tokens": 1301652647.0, - "reward": 1.9091796875, - "reward_std": 0.4718356728553772, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.17374257743358612, + "grad_norm": 10.13999080657959, + "kl": 2.130859375, + "learning_rate": 1.79680697255357e-07, + "loss": 0.0826, + "num_tokens": 1375598716.0, + "reward": 1.09228515625, + "reward_std": 0.3285040259361267, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.17460595071315765, "step": 2423 }, { @@ -70282,27 +70282,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1978.0, - "completions/mean_length": 805.751953125, - "completions/mean_terminated_length": 775.9380493164062, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1055.97265625, + "completions/mean_terminated_length": 1013.5438232421875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, "epoch": 0.8275155756592985, - "grad_norm": 1.3992562294006348, - "kl": 4.984375, - "learning_rate": 1.7931790368435403e-07, - "loss": 0.3079, - "num_tokens": 1302144072.0, - "reward": 1.89111328125, - "reward_std": 0.5150551795959473, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.17533229291439056, + "grad_norm": 2.443547487258911, + "kl": 2.8515625, + "learning_rate": 1.7937626029847312e-07, + "loss": 0.111, + "num_tokens": 1376218254.0, + "reward": 1.064453125, + "reward_std": 0.3818660378456116, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.19604040682315826, "step": 2424 }, { @@ -70311,27 +70311,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1947.0, - "completions/mean_length": 795.474609375, - "completions/mean_terminated_length": 778.1129150390625, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1050.142578125, + "completions/mean_terminated_length": 1022.09033203125, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, "epoch": 0.8278569599726893, - "grad_norm": 0.9801636934280396, - "kl": 4.6953125, - "learning_rate": 1.7901420933328696e-07, - "loss": 0.2657, - "num_tokens": 1302628315.0, - "reward": 1.9033203125, - "reward_std": 0.46424224972724915, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.9462890625, - "rewards/tag_count_reward/std": 0.15964092314243317, + "grad_norm": 2.5989248752593994, + "kl": 2.087890625, + "learning_rate": 1.7907234977338965e-07, + "loss": 0.1018, + "num_tokens": 1376832887.0, + "reward": 1.13720703125, + "reward_std": 0.34223732352256775, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.16384941339492798, "step": 2425 }, { @@ -70340,27 +70340,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1986.0, - "completions/mean_length": 776.330078125, - "completions/mean_terminated_length": 756.1448974609375, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1056.384765625, + "completions/mean_terminated_length": 996.8468017578125, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, "epoch": 0.8281983442860801, - "grad_norm": 2.3696813583374023, - "kl": 4.9140625, - "learning_rate": 1.787110415286956e-07, - "loss": 0.3149, - "num_tokens": 1303104116.0, - "reward": 1.95068359375, - "reward_std": 0.5256731510162354, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.94091796875, - "rewards/tag_count_reward/std": 0.17570249736309052, + "grad_norm": 6.098804473876953, + "kl": 2.703125, + "learning_rate": 1.7876896611177938e-07, + "loss": 0.1801, + "num_tokens": 1377452076.0, + "reward": 1.14013671875, + "reward_std": 0.35894644260406494, + "rewards/accuracy_reward/mean": 0.19140625, + "rewards/accuracy_reward/std": 0.3937928080558777, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.90576171875, + "rewards/tag_count_reward/std": 0.2139485478401184, "step": 2426 }, { @@ -70369,27 +70369,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1974.0, - "completions/mean_length": 863.568359375, - "completions/mean_terminated_length": 812.910400390625, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1132.333984375, + "completions/mean_terminated_length": 1073.3201904296875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.8285397285994709, - "grad_norm": 1.6582800149917603, - "kl": 5.95703125, - "learning_rate": 1.784084007008711e-07, - "loss": 0.406, - "num_tokens": 1303623975.0, - "reward": 1.90966796875, - "reward_std": 0.5016525983810425, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.94091796875, - "rewards/tag_count_reward/std": 0.16859769821166992, + "grad_norm": 3.393190622329712, + "kl": 2.419921875, + "learning_rate": 1.7846610974456655e-07, + "loss": 0.1319, + "num_tokens": 1378109543.0, + "reward": 1.1259765625, + "reward_std": 0.4002212584018707, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.19095149636268616, "step": 2427 }, { @@ -70398,27 +70398,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 834.3359375, - "completions/mean_terminated_length": 777.2515258789062, - "completions/min_length": 48.0, - "completions/min_terminated_length": 48.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1058.845703125, + "completions/mean_terminated_length": 999.4555053710938, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, "epoch": 0.8288811129128617, - "grad_norm": 0.9909442663192749, - "kl": 6.7734375, - "learning_rate": 1.781062872793567e-07, - "loss": 0.4367, - "num_tokens": 1304125779.0, - "reward": 1.86865234375, - "reward_std": 0.5444897413253784, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.19343921542167664, + "grad_norm": 3.487034559249878, + "kl": 2.33984375, + "learning_rate": 1.781637811019267e-07, + "loss": 0.1613, + "num_tokens": 1378726296.0, + "reward": 1.12060546875, + "reward_std": 0.38307666778564453, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.92529296875, + "rewards/tag_count_reward/std": 0.18012800812721252, "step": 2428 }, { @@ -70427,27 +70427,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1979.0, - "completions/mean_length": 765.71484375, - "completions/mean_terminated_length": 750.5098876953125, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_terminated_length": 1951.0, + "completions/mean_length": 1005.728515625, + "completions/mean_terminated_length": 961.1507568359375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, "epoch": 0.8292224972262524, - "grad_norm": 1.1289061307907104, - "kl": 4.28515625, - "learning_rate": 1.778047016929473e-07, - "loss": 0.2583, - "num_tokens": 1304594449.0, - "reward": 1.986328125, - "reward_std": 0.4804825782775879, - "rewards/accuracy_reward/mean": 0.12890625, - "rewards/accuracy_reward/std": 0.33542385697364807, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.953125, - "rewards/tag_count_reward/std": 0.15483088791370392, + "grad_norm": 2.5854415893554688, + "kl": 2.4453125, + "learning_rate": 1.7786198061328567e-07, + "loss": 0.1384, + "num_tokens": 1379317853.0, + "reward": 1.1708984375, + "reward_std": 0.3726179599761963, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.39980348944664, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.18952499330043793, "step": 2429 }, { @@ -70456,27 +70456,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 811.1171875, - "completions/mean_terminated_length": 791.4841918945312, - "completions/min_length": 168.0, - "completions/min_terminated_length": 168.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1130.703125, + "completions/mean_terminated_length": 1091.4705810546875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, "epoch": 0.8295638815396432, - "grad_norm": 1.3904825448989868, - "kl": 4.9140625, - "learning_rate": 1.7750364436968836e-07, - "loss": 0.3166, - "num_tokens": 1305084429.0, - "reward": 1.89697265625, - "reward_std": 0.521807074546814, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93994140625, - "rewards/tag_count_reward/std": 0.16897699236869812, + "grad_norm": 5.406187057495117, + "kl": 2.208984375, + "learning_rate": 1.7756070870731921e-07, + "loss": 0.1285, + "num_tokens": 1379971461.0, + "reward": 1.11767578125, + "reward_std": 0.3863990902900696, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.91259765625, + "rewards/tag_count_reward/std": 0.2058405727148056, "step": 2430 }, { @@ -70485,27 +70485,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 791.181640625, - "completions/mean_terminated_length": 758.4389038085938, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1017.75390625, + "completions/mean_terminated_length": 984.5201416015625, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, "epoch": 0.829905265853034, - "grad_norm": 1.7678790092468262, - "kl": 6.421875, - "learning_rate": 1.7720311573687575e-07, - "loss": 0.4022, - "num_tokens": 1305567786.0, - "reward": 1.92236328125, - "reward_std": 0.5290135145187378, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.1739315241575241, + "grad_norm": 3.6119678020477295, + "kl": 2.130859375, + "learning_rate": 1.7725996581195196e-07, + "loss": 0.125, + "num_tokens": 1380570823.0, + "reward": 1.14501953125, + "reward_std": 0.392635703086853, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.92822265625, + "rewards/tag_count_reward/std": 0.18131764233112335, "step": 2431 }, { @@ -70514,27 +70514,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1937.0, - "completions/mean_length": 828.103515625, - "completions/mean_terminated_length": 788.7520141601562, - "completions/min_length": 209.0, - "completions/min_terminated_length": 209.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1131.615234375, + "completions/mean_terminated_length": 1074.578857421875, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, "epoch": 0.8302466501664248, - "grad_norm": 1.0657150745391846, - "kl": 6.5625, - "learning_rate": 1.769031162210548e-07, - "loss": 0.4132, - "num_tokens": 1306068479.0, - "reward": 1.88134765625, - "reward_std": 0.4663490653038025, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.16893739998340607, + "grad_norm": 4.670883655548096, + "kl": 2.83203125, + "learning_rate": 1.7695975235435765e-07, + "loss": 0.1746, + "num_tokens": 1381226914.0, + "reward": 1.07177734375, + "reward_std": 0.3999711275100708, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.90966796875, + "rewards/tag_count_reward/std": 0.20456935465335846, "step": 2432 }, { @@ -70543,27 +70543,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 847.29296875, - "completions/mean_terminated_length": 793.3836669921875, - "completions/min_length": 71.0, - "completions/min_terminated_length": 71.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1105.158203125, + "completions/mean_terminated_length": 1025.25634765625, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, "epoch": 0.8305880344798157, - "grad_norm": 3.5841877460479736, - "kl": 8.984375, - "learning_rate": 1.766036462480201e-07, - "loss": 0.5267, - "num_tokens": 1306582661.0, - "reward": 1.79296875, - "reward_std": 0.5832004547119141, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.91015625, - "rewards/tag_count_reward/std": 0.21154166758060455, + "grad_norm": 1.498756766319275, + "kl": 2.35546875, + "learning_rate": 1.766600687609574e-07, + "loss": 0.1607, + "num_tokens": 1381873123.0, + "reward": 1.0859375, + "reward_std": 0.3785056471824646, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.912109375, + "rewards/tag_count_reward/std": 0.1949455291032791, "step": 2433 }, { @@ -70572,27 +70572,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 838.97265625, - "completions/mean_terminated_length": 809.9560546875, - "completions/min_length": 115.0, - "completions/min_terminated_length": 115.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1104.72265625, + "completions/mean_terminated_length": 1062.371337890625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.8309294187932065, - "grad_norm": 2.315011978149414, - "kl": 6.375, - "learning_rate": 1.7630470624281442e-07, - "loss": 0.3725, - "num_tokens": 1307092823.0, - "reward": 1.84716796875, - "reward_std": 0.5111981630325317, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.93701171875, - "rewards/tag_count_reward/std": 0.1791812628507614, + "grad_norm": 2.9274964332580566, + "kl": 2.6328125, + "learning_rate": 1.7636091545742038e-07, + "loss": 0.1265, + "num_tokens": 1382519349.0, + "reward": 1.08447265625, + "reward_std": 0.37317654490470886, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.1940557062625885, "step": 2434 }, { @@ -70601,27 +70601,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1958.0, - "completions/mean_length": 872.875, - "completions/mean_terminated_length": 830.0567016601562, - "completions/min_length": 223.0, - "completions/min_terminated_length": 223.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1106.54296875, + "completions/mean_terminated_length": 1056.1768798828125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, "epoch": 0.8312708031065973, - "grad_norm": 2.2868504524230957, - "kl": 7.734375, - "learning_rate": 1.7600629662972832e-07, - "loss": 0.4684, - "num_tokens": 1307613991.0, - "reward": 1.8720703125, - "reward_std": 0.5721770524978638, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.19785255193710327, + "grad_norm": 6.498803615570068, + "kl": 2.22265625, + "learning_rate": 1.7606229286866175e-07, + "loss": 0.0583, + "num_tokens": 1383160155.0, + "reward": 1.134765625, + "reward_std": 0.4322308301925659, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.099609375, + "rewards/format_reward/std": 0.29977133870124817, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.18201786279678345, "step": 2435 }, { @@ -70630,27 +70630,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 820.611328125, - "completions/mean_terminated_length": 778.4586181640625, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1048.37890625, + "completions/mean_terminated_length": 994.9011840820312, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.8316121874199881, - "grad_norm": 2.0491178035736084, - "kl": 5.859375, - "learning_rate": 1.757084178322999e-07, - "loss": 0.3588, - "num_tokens": 1308115648.0, - "reward": 1.861328125, - "reward_std": 0.48040178418159485, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.947265625, - "rewards/tag_count_reward/std": 0.16224436461925507, + "grad_norm": 3.208627462387085, + "kl": 3.23828125, + "learning_rate": 1.757642014188438e-07, + "loss": 0.1899, + "num_tokens": 1383778429.0, + "reward": 1.02783203125, + "reward_std": 0.37499701976776123, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.90673828125, + "rewards/tag_count_reward/std": 0.20564086735248566, "step": 2436 }, { @@ -70659,27 +70659,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 819.5078125, - "completions/mean_terminated_length": 784.9718627929688, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1039.720703125, + "completions/mean_terminated_length": 998.7337036132812, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.8319535717333788, - "grad_norm": 1.4329066276550293, - "kl": 6.2109375, - "learning_rate": 1.754110702733134e-07, - "loss": 0.3724, - "num_tokens": 1308608388.0, - "reward": 1.87744140625, - "reward_std": 0.4910423159599304, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.1781490594148636, + "grad_norm": 3.140491008758545, + "kl": 2.8671875, + "learning_rate": 1.7546664153137359e-07, + "loss": 0.1535, + "num_tokens": 1384383918.0, + "reward": 1.07861328125, + "reward_std": 0.346494197845459, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.1780739426612854, "step": 2437 }, { @@ -70688,27 +70688,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1892.0, - "completions/mean_length": 813.140625, - "completions/mean_terminated_length": 770.7313232421875, - "completions/min_length": 123.0, - "completions/min_terminated_length": 123.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1119.650390625, + "completions/mean_terminated_length": 1036.69140625, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, "epoch": 0.8322949560467696, - "grad_norm": 0.8658393025398254, - "kl": 5.140625, - "learning_rate": 1.7511425437479946e-07, - "loss": 0.3328, - "num_tokens": 1309106924.0, - "reward": 1.90234375, - "reward_std": 0.44942134618759155, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.9453125, - "rewards/tag_count_reward/std": 0.1623503416776657, + "grad_norm": 6.420150279998779, + "kl": 3.59375, + "learning_rate": 1.7516961362890364e-07, + "loss": 0.1808, + "num_tokens": 1385039387.0, + "reward": 1.08251953125, + "reward_std": 0.3912148177623749, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.90087890625, + "rewards/tag_count_reward/std": 0.2168629914522171, "step": 2438 }, { @@ -70717,27 +70717,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1975.0, - "completions/mean_length": 892.69140625, - "completions/mean_terminated_length": 843.279052734375, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1177.849609375, + "completions/mean_terminated_length": 1114.0020751953125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, "epoch": 0.8326363403601604, - "grad_norm": 1.0234296321868896, - "kl": 6.0234375, - "learning_rate": 1.7481797055803382e-07, - "loss": 0.3668, - "num_tokens": 1309645566.0, - "reward": 1.87255859375, - "reward_std": 0.49502408504486084, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.93701171875, - "rewards/tag_count_reward/std": 0.18456129729747772, + "grad_norm": 1.724784255027771, + "kl": 2.642578125, + "learning_rate": 1.7487311813333038e-07, + "loss": 0.1272, + "num_tokens": 1385724030.0, + "reward": 1.119140625, + "reward_std": 0.4232575297355652, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.0859375, + "rewards/format_reward/std": 0.28054583072662354, + "rewards/tag_count_reward/mean": 0.892578125, + "rewards/tag_count_reward/std": 0.21656812727451324, "step": 2439 }, { @@ -70746,27 +70746,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 818.84375, - "completions/mean_terminated_length": 781.7464599609375, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1079.634765625, + "completions/mean_terminated_length": 1027.8292236328125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, "epoch": 0.8329777246735512, - "grad_norm": 2.0106232166290283, - "kl": 5.4921875, - "learning_rate": 1.7452221924353733e-07, - "loss": 0.3998, - "num_tokens": 1310140414.0, - "reward": 1.87109375, - "reward_std": 0.45293596386909485, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.1726529598236084, + "grad_norm": 5.704923152923584, + "kl": 3.240234375, + "learning_rate": 1.7457715546579456e-07, + "loss": 0.1441, + "num_tokens": 1386352403.0, + "reward": 1.0791015625, + "reward_std": 0.400590717792511, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.9072265625, + "rewards/tag_count_reward/std": 0.20734204351902008, "step": 2440 }, { @@ -70775,27 +70775,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1815.0, - "completions/mean_length": 756.9453125, - "completions/mean_terminated_length": 723.3106079101562, - "completions/min_length": 92.0, - "completions/min_terminated_length": 92.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1039.5859375, + "completions/mean_terminated_length": 981.2478637695312, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, "epoch": 0.8333191089869421, - "grad_norm": 2.217083692550659, - "kl": 4.92578125, - "learning_rate": 1.7422700085107485e-07, - "loss": 0.3484, - "num_tokens": 1310609538.0, - "reward": 1.97412109375, - "reward_std": 0.47586676478385925, - "rewards/accuracy_reward/mean": 0.10685484111309052, - "rewards/accuracy_reward/std": 0.3092404901981354, - "rewards/format_reward/mean": 0.9140625, - "rewards/format_reward/std": 0.28054583072662354, - "rewards/tag_count_reward/mean": 0.95654296875, - "rewards/tag_count_reward/std": 0.16008546948432922, + "grad_norm": 2.6870903968811035, + "kl": 3.125, + "learning_rate": 1.7428172604667952e-07, + "loss": 0.1581, + "num_tokens": 1386966239.0, + "reward": 1.17431640625, + "reward_std": 0.45181700587272644, + "rewards/accuracy_reward/mean": 0.18145161867141724, + "rewards/accuracy_reward/std": 0.38578101992607117, + "rewards/format_reward/mean": 0.076171875, + "rewards/format_reward/std": 0.26553234457969666, + "rewards/tag_count_reward/mean": 0.92236328125, + "rewards/tag_count_reward/std": 0.1888602077960968, "step": 2441 }, { @@ -70804,27 +70804,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 802.890625, - "completions/mean_terminated_length": 783.1270141601562, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1063.0390625, + "completions/mean_terminated_length": 1008.2062377929688, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.8336604933003329, - "grad_norm": 1.6382285356521606, - "kl": 4.71875, - "learning_rate": 1.7393231579965467e-07, - "loss": 0.2743, - "num_tokens": 1311097066.0, - "reward": 1.8720703125, - "reward_std": 0.4506915807723999, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.16874286532402039, + "grad_norm": 2.1704299449920654, + "kl": 3.140625, + "learning_rate": 1.7398683029561185e-07, + "loss": 0.1857, + "num_tokens": 1387586963.0, + "reward": 1.08056640625, + "reward_std": 0.34484773874282837, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.1889462023973465, "step": 2442 }, { @@ -70833,27 +70833,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 884.443359375, - "completions/mean_terminated_length": 832.2020263671875, - "completions/min_length": 44.0, - "completions/min_terminated_length": 44.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1165.7265625, + "completions/mean_terminated_length": 1106.908447265625, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, "epoch": 0.8340018776137237, - "grad_norm": 1.4400036334991455, - "kl": 6.08203125, - "learning_rate": 1.736381645075286e-07, - "loss": 0.3848, - "num_tokens": 1311640045.0, - "reward": 1.79443359375, - "reward_std": 0.514877200126648, - "rewards/accuracy_reward/mean": 0.0234375, - "rewards/accuracy_reward/std": 0.15143637359142303, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.20283371210098267, + "grad_norm": 3.793334484100342, + "kl": 3.0234375, + "learning_rate": 1.7369246863145948e-07, + "loss": 0.1297, + "num_tokens": 1388273959.0, + "reward": 1.04931640625, + "reward_std": 0.4144626259803772, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.07421875, + "rewards/format_reward/std": 0.2623828947544098, + "rewards/tag_count_reward/mean": 0.89306640625, + "rewards/tag_count_reward/std": 0.21933400630950928, "step": 2443 }, { @@ -70862,27 +70862,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 858.203125, - "completions/mean_terminated_length": 822.2937622070312, - "completions/min_length": 45.0, - "completions/min_terminated_length": 45.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1106.357421875, + "completions/mean_terminated_length": 1051.8822021484375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, "epoch": 0.8343432619271145, - "grad_norm": 1.732412576675415, - "kl": 6.8828125, - "learning_rate": 1.733445473921904e-07, - "loss": 0.4102, - "num_tokens": 1312155797.0, - "reward": 1.83447265625, - "reward_std": 0.5208526849746704, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.18415184319019318, + "grad_norm": 2.5380656719207764, + "kl": 2.85546875, + "learning_rate": 1.7339864147233225e-07, + "loss": 0.1451, + "num_tokens": 1388916766.0, + "reward": 1.03125, + "reward_std": 0.3855457305908203, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.896484375, + "rewards/tag_count_reward/std": 0.21394017338752747, "step": 2444 }, { @@ -70891,27 +70891,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 802.6953125, - "completions/mean_terminated_length": 752.0731201171875, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1040.208984375, + "completions/mean_terminated_length": 981.906982421875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, "epoch": 0.8346846462405052, - "grad_norm": 0.8828231692314148, - "kl": 7.34375, - "learning_rate": 1.7305146487037603e-07, - "loss": 0.4933, - "num_tokens": 1312640825.0, - "reward": 1.91259765625, - "reward_std": 0.544389545917511, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.19574224948883057, + "grad_norm": 2.464055061340332, + "kl": 3.203125, + "learning_rate": 1.7310534923558025e-07, + "loss": 0.2021, + "num_tokens": 1389523401.0, + "reward": 1.12451171875, + "reward_std": 0.40834134817123413, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.07421875, + "rewards/format_reward/std": 0.2623828947544098, + "rewards/tag_count_reward/mean": 0.91552734375, + "rewards/tag_count_reward/std": 0.20106884837150574, "step": 2445 }, { @@ -70920,27 +70920,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1999.0, - "completions/mean_length": 851.515625, - "completions/mean_terminated_length": 802.8779907226562, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1109.09765625, + "completions/mean_terminated_length": 1048.5863037109375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, "epoch": 0.835026030553896, - "grad_norm": 2.0713346004486084, - "kl": 8.328125, - "learning_rate": 1.727589173580625e-07, - "loss": 0.5235, - "num_tokens": 1313149937.0, - "reward": 1.80859375, - "reward_std": 0.5601276159286499, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.19890500605106354, + "grad_norm": 3.8685975074768066, + "kl": 2.771484375, + "learning_rate": 1.7281259233779436e-07, + "loss": 0.1386, + "num_tokens": 1390164395.0, + "reward": 1.0576171875, + "reward_std": 0.34709495306015015, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.8955078125, + "rewards/tag_count_reward/std": 0.2146068513393402, "step": 2446 }, { @@ -70949,27 +70949,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 861.833984375, - "completions/mean_terminated_length": 838.2052001953125, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1113.3203125, + "completions/mean_terminated_length": 1038.38818359375, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, "epoch": 0.8353674148672868, - "grad_norm": 0.9768324494361877, - "kl": 5.34765625, - "learning_rate": 1.724669052704673e-07, - "loss": 0.3107, - "num_tokens": 1313666588.0, - "reward": 1.88671875, - "reward_std": 0.5045324563980103, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.17203198373317719, + "grad_norm": 2.6831860542297363, + "kl": 3.25, + "learning_rate": 1.7252037119480438e-07, + "loss": 0.166, + "num_tokens": 1390809807.0, + "reward": 1.033203125, + "reward_std": 0.37386655807495117, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.892578125, + "rewards/tag_count_reward/std": 0.22048601508140564, "step": 2447 }, { @@ -70978,27 +70978,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 779.828125, - "completions/mean_terminated_length": 738.9193115234375, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1033.1484375, + "completions/mean_terminated_length": 960.9623413085938, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.8357087991806776, - "grad_norm": 1.2612099647521973, - "kl": 6.4609375, - "learning_rate": 1.7217542902204847e-07, - "loss": 0.4308, - "num_tokens": 1314136148.0, - "reward": 1.9033203125, - "reward_std": 0.4736611843109131, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.89453125, - "rewards/format_reward/std": 0.3074568510055542, - "rewards/tag_count_reward/mean": 0.9443359375, - "rewards/tag_count_reward/std": 0.1764710247516632, + "grad_norm": 7.181947708129883, + "kl": 3.1796875, + "learning_rate": 1.7222868622167998e-07, + "loss": 0.1493, + "num_tokens": 1391409067.0, + "reward": 1.09130859375, + "reward_std": 0.4202735424041748, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.90185546875, + "rewards/tag_count_reward/std": 0.21843034029006958, "step": 2448 }, { @@ -71007,27 +71007,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.10546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1961.0, - "completions/mean_length": 815.84375, - "completions/mean_terminated_length": 760.5223999023438, - "completions/min_length": 67.0, - "completions/min_terminated_length": 67.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1140.931640625, + "completions/mean_terminated_length": 1033.9847412109375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, "epoch": 0.8360501834940685, - "grad_norm": 0.8281819820404053, - "kl": 6.25, - "learning_rate": 1.7188448902650287e-07, - "loss": 0.3747, - "num_tokens": 1314636884.0, - "reward": 1.8671875, - "reward_std": 0.5257279276847839, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.190873920917511, + "grad_norm": 4.506415843963623, + "kl": 3.64453125, + "learning_rate": 1.7193753783272847e-07, + "loss": 0.2437, + "num_tokens": 1392076248.0, + "reward": 1.07421875, + "reward_std": 0.4136364459991455, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.87890625, + "rewards/tag_count_reward/std": 0.23716437816619873, "step": 2449 }, { @@ -71036,27 +71036,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 798.275390625, - "completions/mean_terminated_length": 765.7174682617188, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1107.666015625, + "completions/mean_terminated_length": 1042.883056640625, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, "epoch": 0.8363915678074593, - "grad_norm": 0.9520702958106995, - "kl": 5.30078125, - "learning_rate": 1.7159408569676704e-07, - "loss": 0.3225, - "num_tokens": 1315122033.0, - "reward": 1.873046875, - "reward_std": 0.49339762330055237, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.9375, - "rewards/tag_count_reward/std": 0.18373169004917145, + "grad_norm": 4.927025318145752, + "kl": 2.86328125, + "learning_rate": 1.7164692644149557e-07, + "loss": 0.1557, + "num_tokens": 1392719805.0, + "reward": 1.08154296875, + "reward_std": 0.38479864597320557, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.89990234375, + "rewards/tag_count_reward/std": 0.2059890627861023, "step": 2450 }, { @@ -71065,27 +71065,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1920.0, - "completions/mean_length": 820.03515625, - "completions/mean_terminated_length": 764.9019775390625, - "completions/min_length": 134.0, - "completions/min_terminated_length": 134.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1064.2578125, + "completions/mean_terminated_length": 1005.1925659179688, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, "epoch": 0.8367329521208501, - "grad_norm": 1.0722157955169678, - "kl": 6.51171875, - "learning_rate": 1.71304219445015e-07, - "loss": 0.44, - "num_tokens": 1315613059.0, - "reward": 1.89208984375, - "reward_std": 0.5305795669555664, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93701171875, - "rewards/tag_count_reward/std": 0.18189117312431335, + "grad_norm": 2.6272780895233154, + "kl": 2.15234375, + "learning_rate": 1.713568524607637e-07, + "loss": 0.1015, + "num_tokens": 1393335873.0, + "reward": 1.11279296875, + "reward_std": 0.3726275563240051, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.91162109375, + "rewards/tag_count_reward/std": 0.19440992176532745, "step": 2451 }, { @@ -71094,27 +71094,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1933.0, - "completions/mean_length": 834.470703125, - "completions/mean_terminated_length": 797.8450317382812, - "completions/min_length": 138.0, - "completions/min_terminated_length": 138.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1174.25, + "completions/mean_terminated_length": 1116.0, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, "epoch": 0.8370743364342409, - "grad_norm": 1.146774172782898, - "kl": 5.04296875, - "learning_rate": 1.7101489068265935e-07, - "loss": 0.3122, - "num_tokens": 1316124084.0, - "reward": 1.91845703125, - "reward_std": 0.46381455659866333, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.94970703125, - "rewards/tag_count_reward/std": 0.16339389979839325, + "grad_norm": 2.45058536529541, + "kl": 2.921875, + "learning_rate": 1.710673163025526e-07, + "loss": 0.1479, + "num_tokens": 1394020865.0, + "reward": 1.10400390625, + "reward_std": 0.43304574489593506, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.08984375, + "rewards/format_reward/std": 0.2862374484539032, + "rewards/tag_count_reward/mean": 0.89111328125, + "rewards/tag_count_reward/std": 0.22280485928058624, "step": 2452 }, { @@ -71123,27 +71123,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 791.599609375, - "completions/mean_terminated_length": 756.2791137695312, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1096.330078125, + "completions/mean_terminated_length": 1037.0975341796875, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, "epoch": 0.8374157207476316, - "grad_norm": 2.683335781097412, - "kl": 5.53515625, - "learning_rate": 1.7072609982034874e-07, - "loss": 0.3822, - "num_tokens": 1316606215.0, - "reward": 1.84228515625, - "reward_std": 0.4803961515426636, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.18939071893692017, + "grad_norm": 2.160078525543213, + "kl": 2.0546875, + "learning_rate": 1.707783183781174e-07, + "loss": 0.0922, + "num_tokens": 1394659018.0, + "reward": 1.05859375, + "reward_std": 0.36901023983955383, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.072265625, + "rewards/format_reward/std": 0.2591804563999176, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.1825525462627411, "step": 2453 }, { @@ -71152,27 +71152,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1946.0, - "completions/mean_length": 807.419921875, - "completions/mean_terminated_length": 780.181640625, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1119.919921875, + "completions/mean_terminated_length": 1072.2772216796875, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, "epoch": 0.8377571050610224, - "grad_norm": 0.8453850150108337, - "kl": 5.09375, - "learning_rate": 1.7043784726796934e-07, - "loss": 0.3165, - "num_tokens": 1317093790.0, - "reward": 1.89599609375, - "reward_std": 0.43774914741516113, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.94287109375, - "rewards/tag_count_reward/std": 0.17070980370044708, + "grad_norm": 2.0535600185394287, + "kl": 2.43359375, + "learning_rate": 1.7048985909794928e-07, + "loss": 0.1273, + "num_tokens": 1395306593.0, + "reward": 1.08349609375, + "reward_std": 0.39434653520584106, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.08984375, + "rewards/format_reward/std": 0.2862374484539032, + "rewards/tag_count_reward/mean": 0.91552734375, + "rewards/tag_count_reward/std": 0.19923560321331024, "step": 2454 }, { @@ -71181,27 +71181,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1880.0, - "completions/mean_length": 817.33984375, - "completions/mean_terminated_length": 769.9107055664062, - "completions/min_length": 124.0, - "completions/min_terminated_length": 124.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1159.25, + "completions/mean_terminated_length": 1071.519287109375, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, "epoch": 0.8380984893744132, - "grad_norm": 0.8198954463005066, - "kl": 6.859375, - "learning_rate": 1.7015013343464302e-07, - "loss": 0.4087, - "num_tokens": 1317593932.0, - "reward": 1.8173828125, - "reward_std": 0.5331565141677856, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.1941080093383789, + "grad_norm": 3.17631196975708, + "kl": 2.921875, + "learning_rate": 1.7020193887177403e-07, + "loss": 0.1725, + "num_tokens": 1395981793.0, + "reward": 1.03173828125, + "reward_std": 0.38581639528274536, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.88134765625, + "rewards/tag_count_reward/std": 0.22817394137382507, "step": 2455 }, { @@ -71210,27 +71210,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 722.31640625, - "completions/mean_terminated_length": 698.5963745117188, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1000.462890625, + "completions/mean_terminated_length": 955.659912109375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, "epoch": 0.838439873687804, - "grad_norm": 1.1952383518218994, - "kl": 4.9921875, - "learning_rate": 1.698629587287266e-07, - "loss": 0.2944, - "num_tokens": 1318038894.0, - "reward": 1.93896484375, - "reward_std": 0.4384310245513916, - "rewards/accuracy_reward/mean": 0.0786290317773819, - "rewards/accuracy_reward/std": 0.26943066716194153, - "rewards/format_reward/mean": 0.912109375, - "rewards/format_reward/std": 0.2834126651287079, - "rewards/tag_count_reward/mean": 0.95068359375, - "rewards/tag_count_reward/std": 0.15991228818893433, + "grad_norm": 2.7116317749023438, + "kl": 3.453125, + "learning_rate": 1.6991455810855204e-07, + "loss": 0.2009, + "num_tokens": 1396569166.0, + "reward": 1.08203125, + "reward_std": 0.3808964490890503, + "rewards/accuracy_reward/mean": 0.12096773833036423, + "rewards/accuracy_reward/std": 0.32641899585723877, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.908203125, + "rewards/tag_count_reward/std": 0.20718760788440704, "step": 2456 }, { @@ -71239,27 +71239,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 754.697265625, - "completions/mean_terminated_length": 726.3013916015625, - "completions/min_length": 8.0, - "completions/min_terminated_length": 8.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1003.875, + "completions/mean_terminated_length": 965.8299560546875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, "epoch": 0.8387812580011949, - "grad_norm": 1.8518648147583008, - "kl": 5.53125, - "learning_rate": 1.6957632355781243e-07, - "loss": 0.2906, - "num_tokens": 1318494899.0, - "reward": 1.955078125, - "reward_std": 0.4595924913883209, - "rewards/accuracy_reward/mean": 0.134765625, - "rewards/accuracy_reward/std": 0.3418070077896118, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.94140625, - "rewards/tag_count_reward/std": 0.17411890625953674, + "grad_norm": 3.508617877960205, + "kl": 2.083984375, + "learning_rate": 1.6962771721647705e-07, + "loss": 0.1221, + "num_tokens": 1397152750.0, + "reward": 1.15771484375, + "reward_std": 0.3813819885253906, + "rewards/accuracy_reward/mean": 0.193359375, + "rewards/accuracy_reward/std": 0.39531853795051575, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.18335099518299103, "step": 2457 }, { @@ -71268,27 +71268,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1908.0, - "completions/mean_length": 794.3828125, - "completions/mean_terminated_length": 761.7234497070312, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1145.21484375, + "completions/mean_terminated_length": 1085.0291748046875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, "epoch": 0.8391226423145857, - "grad_norm": 1.0499241352081299, - "kl": 5.875, - "learning_rate": 1.6929022832872653e-07, - "loss": 0.3696, - "num_tokens": 1318979223.0, - "reward": 1.9111328125, - "reward_std": 0.4973984956741333, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.17514485120773315, + "grad_norm": 4.807103633880615, + "kl": 2.234375, + "learning_rate": 1.693414166029764e-07, + "loss": 0.0903, + "num_tokens": 1397816700.0, + "reward": 1.1318359375, + "reward_std": 0.4273075759410858, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.080078125, + "rewards/format_reward/std": 0.271679550409317, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.20634421706199646, "step": 2458 }, { @@ -71297,27 +71297,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1803.0, - "completions/mean_length": 747.6796875, - "completions/mean_terminated_length": 724.4135131835938, - "completions/min_length": 13.0, - "completions/min_terminated_length": 13.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1069.015625, + "completions/mean_terminated_length": 1035.39404296875, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, "epoch": 0.8394640266279765, - "grad_norm": 0.9654370546340942, - "kl": 5.171875, - "learning_rate": 1.6900467344752872e-07, - "loss": 0.2651, - "num_tokens": 1319440211.0, - "reward": 1.9091796875, - "reward_std": 0.45924073457717896, - "rewards/accuracy_reward/mean": 0.07258064299821854, - "rewards/accuracy_reward/std": 0.25970885157585144, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.9521484375, - "rewards/tag_count_reward/std": 0.15133222937583923, + "grad_norm": 4.118847846984863, + "kl": 2.037109375, + "learning_rate": 1.6905565667470954e-07, + "loss": 0.0873, + "num_tokens": 1398442212.0, + "reward": 1.134765625, + "reward_std": 0.3714521527290344, + "rewards/accuracy_reward/mean": 0.14717741310596466, + "rewards/accuracy_reward/std": 0.3546403646469116, + "rewards/format_reward/mean": 0.07421875, + "rewards/format_reward/std": 0.2623828947544098, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.18796826899051666, "step": 2459 }, { @@ -71326,27 +71326,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1948.0, - "completions/mean_length": 790.044921875, - "completions/mean_terminated_length": 757.2725830078125, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1090.4375, + "completions/mean_terminated_length": 1061.5372314453125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, "epoch": 0.8398054109413673, - "grad_norm": 0.9977067708969116, - "kl": 6.08984375, - "learning_rate": 1.6871965931951178e-07, - "loss": 0.3605, - "num_tokens": 1319916170.0, - "reward": 1.85400390625, - "reward_std": 0.4964134097099304, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.19136326014995575, + "grad_norm": 1.8655813932418823, + "kl": 1.4052734375, + "learning_rate": 1.6877043783756838e-07, + "loss": 0.0529, + "num_tokens": 1399071972.0, + "reward": 1.103515625, + "reward_std": 0.3832892179489136, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.1781550794839859, "step": 2460 }, { @@ -71355,27 +71355,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1907.0, - "completions/mean_length": 755.177734375, - "completions/mean_terminated_length": 716.158935546875, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1036.630859375, + "completions/mean_terminated_length": 973.6826171875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.840146795254758, - "grad_norm": 1.0653966665267944, - "kl": 7.1015625, - "learning_rate": 1.684351863492014e-07, - "loss": 0.4232, - "num_tokens": 1320371173.0, - "reward": 1.837890625, - "reward_std": 0.6131343841552734, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.2011692225933075, + "grad_norm": 1.6605829000473022, + "kl": 2.90625, + "learning_rate": 1.6848576049667605e-07, + "loss": 0.1449, + "num_tokens": 1399671079.0, + "reward": 1.1103515625, + "reward_std": 0.3776431083679199, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.9189453125, + "rewards/tag_count_reward/std": 0.1909715086221695, "step": 2461 }, { @@ -71384,27 +71384,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 812.6015625, - "completions/mean_terminated_length": 777.8714599609375, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1134.431640625, + "completions/mean_terminated_length": 1048.5406494140625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.8404881795681488, - "grad_norm": 0.7508025169372559, - "kl": 5.80859375, - "learning_rate": 1.6815125494035494e-07, - "loss": 0.379, - "num_tokens": 1320861273.0, - "reward": 1.857421875, - "reward_std": 0.42223262786865234, - "rewards/accuracy_reward/mean": 0.02822580561041832, - "rewards/accuracy_reward/std": 0.1657845675945282, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.94140625, - "rewards/tag_count_reward/std": 0.18302303552627563, + "grad_norm": 2.485846519470215, + "kl": 2.93359375, + "learning_rate": 1.6820162505638675e-07, + "loss": 0.1791, + "num_tokens": 1400325956.0, + "reward": 1.05078125, + "reward_std": 0.3807339370250702, + "rewards/accuracy_reward/mean": 0.06653226166963577, + "rewards/accuracy_reward/std": 0.24946178495883942, + "rewards/format_reward/mean": 0.083984375, + "rewards/format_reward/std": 0.2776356339454651, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.2080436646938324, "step": 2462 }, { @@ -71413,27 +71413,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1952.0, - "completions/mean_length": 814.798828125, - "completions/mean_terminated_length": 782.6713256835938, - "completions/min_length": 39.0, - "completions/min_terminated_length": 39.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1139.251953125, + "completions/mean_terminated_length": 1060.1466064453125, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, "epoch": 0.8408295638815396, - "grad_norm": 1.774247169494629, - "kl": 4.22265625, - "learning_rate": 1.678678654959609e-07, - "loss": 0.2615, - "num_tokens": 1321359538.0, - "reward": 1.869140625, - "reward_std": 0.48968732357025146, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.9375, - "rewards/tag_count_reward/std": 0.18373169004917145, + "grad_norm": 2.64872670173645, + "kl": 2.9140625, + "learning_rate": 1.6791803192028458e-07, + "loss": 0.1612, + "num_tokens": 1400990341.0, + "reward": 1.0322265625, + "reward_std": 0.40245115756988525, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.8798828125, + "rewards/tag_count_reward/std": 0.23298367857933044, "step": 2463 }, { @@ -71442,27 +71442,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 772.46484375, - "completions/mean_terminated_length": 736.6063842773438, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1013.56640625, + "completions/mean_terminated_length": 960.464111328125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, "epoch": 0.8411709481949304, - "grad_norm": 3.33648943901062, - "kl": 5.1328125, - "learning_rate": 1.6758501841823902e-07, - "loss": 0.3516, - "num_tokens": 1321830304.0, - "reward": 1.8837890625, - "reward_std": 0.48854079842567444, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.17866657674312592, + "grad_norm": 1.9933552742004395, + "kl": 2.66015625, + "learning_rate": 1.676349814911837e-07, + "loss": 0.1399, + "num_tokens": 1401584551.0, + "reward": 1.03271484375, + "reward_std": 0.32293686270713806, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.91357421875, + "rewards/tag_count_reward/std": 0.1983945220708847, "step": 2464 }, { @@ -71471,27 +71471,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 838.130859375, - "completions/mean_terminated_length": 788.9491577148438, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1114.7109375, + "completions/mean_terminated_length": 1044.1260986328125, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, "epoch": 0.8415123325083212, - "grad_norm": 0.8046271204948425, - "kl": 5.984375, - "learning_rate": 1.6730271410863864e-07, - "loss": 0.3562, - "num_tokens": 1322348707.0, - "reward": 1.83251953125, - "reward_std": 0.5146738290786743, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.19859671592712402, + "grad_norm": 5.01402473449707, + "kl": 3.44921875, + "learning_rate": 1.6735247417112718e-07, + "loss": 0.1805, + "num_tokens": 1402244563.0, + "reward": 1.0732421875, + "reward_std": 0.43712496757507324, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.1015625, + "rewards/format_reward/std": 0.30236753821372986, + "rewards/tag_count_reward/mean": 0.8935546875, + "rewards/tag_count_reward/std": 0.22261446714401245, "step": 2465 }, { @@ -71500,27 +71500,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 772.365234375, - "completions/mean_terminated_length": 739.1322631835938, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1133.181640625, + "completions/mean_terminated_length": 1059.8416748046875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, "epoch": 0.8418537168217121, - "grad_norm": 1.6727104187011719, - "kl": 3.3203125, - "learning_rate": 1.6702095296783942e-07, - "loss": 0.2213, - "num_tokens": 1322823614.0, - "reward": 1.951171875, - "reward_std": 0.41594749689102173, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.916015625, - "rewards/format_reward/std": 0.2776356339454651, - "rewards/tag_count_reward/mean": 0.958984375, - "rewards/tag_count_reward/std": 0.14764074981212616, + "grad_norm": 5.2469048500061035, + "kl": 2.443359375, + "learning_rate": 1.6707051036138687e-07, + "loss": 0.1517, + "num_tokens": 1402904208.0, + "reward": 1.05078125, + "reward_std": 0.37223637104034424, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.072265625, + "rewards/format_reward/std": 0.2591804563999176, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.20925270020961761, "step": 2466 }, { @@ -71529,27 +71529,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 803.171875, - "completions/mean_terminated_length": 778.37451171875, - "completions/min_length": 72.0, - "completions/min_terminated_length": 72.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1114.916015625, + "completions/mean_terminated_length": 1075.0081787109375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, "epoch": 0.8421951011351029, - "grad_norm": 1.9509830474853516, - "kl": 3.86328125, - "learning_rate": 1.6673973539574953e-07, - "loss": 0.2425, - "num_tokens": 1323309702.0, - "reward": 1.9228515625, - "reward_std": 0.433314710855484, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.90234375, - "rewards/format_reward/std": 0.29713961482048035, - "rewards/tag_count_reward/mean": 0.9521484375, - "rewards/tag_count_reward/std": 0.1576654314994812, + "grad_norm": 11.170723915100098, + "kl": 2.71875, + "learning_rate": 1.6678909046246247e-07, + "loss": 0.0937, + "num_tokens": 1403549909.0, + "reward": 1.10205078125, + "reward_std": 0.4038926362991333, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29176566004753113, + "rewards/tag_count_reward/mean": 0.91455078125, + "rewards/tag_count_reward/std": 0.2036799192428589, "step": 2467 }, { @@ -71558,27 +71558,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.091796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1876.0, - "completions/mean_length": 851.47265625, - "completions/mean_terminated_length": 807.87451171875, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1165.169921875, + "completions/mean_terminated_length": 1075.9376220703125, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, "epoch": 0.8425364854484937, - "grad_norm": 1.1834297180175781, - "kl": 6.5078125, - "learning_rate": 1.6645906179150592e-07, - "loss": 0.3857, - "num_tokens": 1323829128.0, - "reward": 1.8564453125, - "reward_std": 0.5604408979415894, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.19524674117565155, + "grad_norm": 2.3902857303619385, + "kl": 2.583984375, + "learning_rate": 1.6650821487408128e-07, + "loss": 0.1381, + "num_tokens": 1404229948.0, + "reward": 1.06103515625, + "reward_std": 0.4081980884075165, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.90087890625, + "rewards/tag_count_reward/std": 0.21056769788265228, "step": 2468 }, { @@ -71587,27 +71587,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 800.455078125, - "completions/mean_terminated_length": 762.8027954101562, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1112.4453125, + "completions/mean_terminated_length": 1052.149658203125, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, "epoch": 0.8428778697618844, - "grad_norm": 1.1407212018966675, - "kl": 6.34765625, - "learning_rate": 1.661789325534737e-07, - "loss": 0.4454, - "num_tokens": 1324316465.0, - "reward": 1.904296875, - "reward_std": 0.48995035886764526, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.89453125, - "rewards/format_reward/std": 0.3074568510055542, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.18171313405036926, + "grad_norm": 1.9006119966506958, + "kl": 1.9951171875, + "learning_rate": 1.6622788399519722e-07, + "loss": 0.1029, + "num_tokens": 1404877024.0, + "reward": 1.1025390625, + "reward_std": 0.38338834047317505, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.1867826133966446, "step": 2469 }, { @@ -71616,27 +71616,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1890.0, - "completions/mean_length": 773.349609375, - "completions/mean_terminated_length": 707.9158325195312, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1055.48828125, + "completions/mean_terminated_length": 1015.1422119140625, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, "epoch": 0.8432192540752752, - "grad_norm": 1.5480372905731201, - "kl": 7.921875, - "learning_rate": 1.6589934807924482e-07, - "loss": 0.5114, - "num_tokens": 1324782356.0, - "reward": 1.90087890625, - "reward_std": 0.6045435667037964, - "rewards/accuracy_reward/mean": 0.12298387289047241, - "rewards/accuracy_reward/std": 0.32875028252601624, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.1979798823595047, + "grad_norm": 2.8661036491394043, + "kl": 2.90625, + "learning_rate": 1.6594809822399073e-07, + "loss": 0.1701, + "num_tokens": 1405487370.0, + "reward": 1.17724609375, + "reward_std": 0.46351733803749084, + "rewards/accuracy_reward/mean": 0.17943547666072845, + "rewards/accuracy_reward/std": 0.3841039538383484, + "rewards/format_reward/mean": 0.083984375, + "rewards/format_reward/std": 0.2776356339454651, + "rewards/tag_count_reward/mean": 0.91943359375, + "rewards/tag_count_reward/std": 0.1914980411529541, "step": 2470 }, { @@ -71645,27 +71645,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 728.19921875, - "completions/mean_terminated_length": 707.2500610351562, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1017.552734375, + "completions/mean_terminated_length": 932.58984375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.843560638388666, - "grad_norm": 1.6724046468734741, - "kl": 6.765625, - "learning_rate": 1.6562030876563843e-07, - "loss": 0.375, - "num_tokens": 1325229978.0, - "reward": 1.828125, - "reward_std": 0.5169737339019775, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.18546061217784882, + "grad_norm": 10.830955505371094, + "kl": 3.6875, + "learning_rate": 1.6566885795786775e-07, + "loss": 0.2614, + "num_tokens": 1406083141.0, + "reward": 1.044921875, + "reward_std": 0.3794099986553192, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.22362731397151947, "step": 2471 }, { @@ -71674,27 +71674,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1889.0, - "completions/mean_length": 785.814453125, - "completions/mean_terminated_length": 742.4666748046875, - "completions/min_length": 83.0, - "completions/min_terminated_length": 83.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1022.1953125, + "completions/mean_terminated_length": 967.3168334960938, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.8439020227020568, - "grad_norm": 2.92116641998291, - "kl": 7.4765625, - "learning_rate": 1.653418150086996e-07, - "loss": 0.4605, - "num_tokens": 1325707051.0, - "reward": 1.8876953125, - "reward_std": 0.48363927006721497, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.17934982478618622, + "grad_norm": 3.130136728286743, + "kl": 2.75, + "learning_rate": 1.653901635934596e-07, + "loss": 0.1494, + "num_tokens": 1406681241.0, + "reward": 1.0986328125, + "reward_std": 0.3787572979927063, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.9169921875, + "rewards/tag_count_reward/std": 0.19458001852035522, "step": 2472 }, { @@ -71703,27 +71703,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 796.6875, - "completions/mean_terminated_length": 740.506103515625, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1096.923828125, + "completions/mean_terminated_length": 1018.5052490234375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.8442434070154476, - "grad_norm": 1.2109431028366089, - "kl": 7.7734375, - "learning_rate": 1.6506386720369953e-07, - "loss": 0.5195, - "num_tokens": 1326190651.0, - "reward": 1.89208984375, - "reward_std": 0.49778905510902405, - "rewards/accuracy_reward/mean": 0.10080645233392715, - "rewards/accuracy_reward/std": 0.30137622356414795, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.19343921542167664, + "grad_norm": 2.968031167984009, + "kl": 2.20703125, + "learning_rate": 1.6511201552662212e-07, + "loss": 0.1407, + "num_tokens": 1407318562.0, + "reward": 1.138671875, + "reward_std": 0.4225820302963257, + "rewards/accuracy_reward/mean": 0.14516128599643707, + "rewards/accuracy_reward/std": 0.3526190221309662, + "rewards/format_reward/mean": 0.087890625, + "rewards/format_reward/std": 0.2834126651287079, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.2032860517501831, "step": 2473 }, { @@ -71732,27 +71732,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 802.771484375, - "completions/mean_terminated_length": 744.2024536132812, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1059.109375, + "completions/mean_terminated_length": 1010.475341796875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.8445847913288385, - "grad_norm": 1.1114060878753662, - "kl": 7.3125, - "learning_rate": 1.6478646574513409e-07, - "loss": 0.4733, - "num_tokens": 1326674310.0, - "reward": 1.93017578125, - "reward_std": 0.6183890104293823, - "rewards/accuracy_reward/mean": 0.162109375, - "rewards/accuracy_reward/std": 0.3689115643501282, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.1941147744655609, + "grad_norm": 2.307568311691284, + "kl": 2.45703125, + "learning_rate": 1.6483441415243538e-07, + "loss": 0.1443, + "num_tokens": 1407933466.0, + "reward": 1.240234375, + "reward_std": 0.4436051845550537, + "rewards/accuracy_reward/mean": 0.240234375, + "rewards/accuracy_reward/std": 0.4276435375213623, + "rewards/format_reward/mean": 0.083984375, + "rewards/format_reward/std": 0.2776356339454651, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.20036010444164276, "step": 2474 }, { @@ -71761,27 +71761,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 820.759765625, - "completions/mean_terminated_length": 778.6121826171875, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1081.90625, + "completions/mean_terminated_length": 1011.0188598632812, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, "epoch": 0.8449261756422293, - "grad_norm": 1.6558356285095215, - "kl": 5.71875, - "learning_rate": 1.6450961102672394e-07, - "loss": 0.3816, - "num_tokens": 1327182155.0, - "reward": 1.89111328125, - "reward_std": 0.4347790479660034, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.94775390625, - "rewards/tag_count_reward/std": 0.16795603930950165, + "grad_norm": 1.897137999534607, + "kl": 3.083984375, + "learning_rate": 1.645573598652025e-07, + "loss": 0.1725, + "num_tokens": 1408575018.0, + "reward": 1.0595703125, + "reward_std": 0.3842903971672058, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.076171875, + "rewards/format_reward/std": 0.26553234457969666, + "rewards/tag_count_reward/mean": 0.9111328125, + "rewards/tag_count_reward/std": 0.2043152004480362, "step": 2475 }, { @@ -71790,27 +71790,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 789.876953125, - "completions/mean_terminated_length": 762.2534790039062, - "completions/min_length": 86.0, - "completions/min_terminated_length": 86.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1071.18359375, + "completions/mean_terminated_length": 983.8936157226562, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, "epoch": 0.8452675599556201, - "grad_norm": 1.3808722496032715, - "kl": 4.55078125, - "learning_rate": 1.6423330344141401e-07, - "loss": 0.2757, - "num_tokens": 1327672668.0, - "reward": 1.93115234375, - "reward_std": 0.5451623201370239, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.1655435413122177, + "grad_norm": 6.261782169342041, + "kl": 3.28125, + "learning_rate": 1.6428085305844997e-07, + "loss": 0.2415, + "num_tokens": 1409209560.0, + "reward": 1.107421875, + "reward_std": 0.41458624601364136, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.904296875, + "rewards/tag_count_reward/std": 0.21300913393497467, "step": 2476 }, { @@ -71819,27 +71819,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 871.990234375, - "completions/mean_terminated_length": 819.1897583007812, - "completions/min_length": 173.0, - "completions/min_terminated_length": 173.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1102.306640625, + "completions/mean_terminated_length": 1053.759765625, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, "epoch": 0.8456089442690108, - "grad_norm": 2.074889898300171, - "kl": 6.6328125, - "learning_rate": 1.6395754338137203e-07, - "loss": 0.4639, - "num_tokens": 1328196599.0, - "reward": 1.83984375, - "reward_std": 0.562311053276062, - "rewards/accuracy_reward/mean": 0.06854838877916336, - "rewards/accuracy_reward/std": 0.25293973088264465, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.2010456770658493, + "grad_norm": 2.166699171066284, + "kl": 2.3125, + "learning_rate": 1.6400489412492625e-07, + "loss": 0.1307, + "num_tokens": 1409851413.0, + "reward": 1.05908203125, + "reward_std": 0.34913110733032227, + "rewards/accuracy_reward/mean": 0.07056451588869095, + "rewards/accuracy_reward/std": 0.25635460019111633, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.92236328125, + "rewards/tag_count_reward/std": 0.18625172972679138, "step": 2477 }, { @@ -71848,27 +71848,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 813.85546875, - "completions/mean_terminated_length": 768.8866577148438, - "completions/min_length": 169.0, - "completions/min_terminated_length": 169.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1065.029296875, + "completions/mean_terminated_length": 986.2257080078125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.8459503285824016, - "grad_norm": 1.4364606142044067, - "kl": 6.109375, - "learning_rate": 1.6368233123798913e-07, - "loss": 0.4024, - "num_tokens": 1328692173.0, - "reward": 1.87109375, - "reward_std": 0.48487743735313416, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.9375, - "rewards/tag_count_reward/std": 0.18571804463863373, + "grad_norm": 11.078619003295898, + "kl": 3.5546875, + "learning_rate": 1.6372948345660187e-07, + "loss": 0.2928, + "num_tokens": 1410475588.0, + "reward": 1.02392578125, + "reward_std": 0.3407014012336731, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.1939331740140915, + "rewards/tag_count_reward/mean": 0.91064453125, + "rewards/tag_count_reward/std": 0.2085477113723755, "step": 2478 }, { @@ -71877,27 +71877,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 812.30859375, - "completions/mean_terminated_length": 775.0140380859375, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1050.736328125, + "completions/mean_terminated_length": 1005.961181640625, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, "epoch": 0.8462917128957924, - "grad_norm": 1.5719364881515503, - "kl": 6.1171875, - "learning_rate": 1.634076674018785e-07, - "loss": 0.3933, - "num_tokens": 1329174587.0, - "reward": 1.9228515625, - "reward_std": 0.5373904705047607, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.17374257743358612, + "grad_norm": 4.527271270751953, + "kl": 2.48046875, + "learning_rate": 1.6345462144466864e-07, + "loss": 0.1184, + "num_tokens": 1411080077.0, + "reward": 1.14599609375, + "reward_std": 0.4120045602321625, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.91552734375, + "rewards/tag_count_reward/std": 0.18786084651947021, "step": 2479 }, { @@ -71906,27 +71906,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 839.46484375, - "completions/mean_terminated_length": 812.9301147460938, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1079.623046875, + "completions/mean_terminated_length": 1031.9979248046875, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, "epoch": 0.8466330972091832, - "grad_norm": 0.9839436411857605, - "kl": 4.65234375, - "learning_rate": 1.6313355226287535e-07, - "loss": 0.2931, - "num_tokens": 1329679961.0, - "reward": 1.90185546875, - "reward_std": 0.4618559777736664, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.89453125, - "rewards/format_reward/std": 0.3074568510055542, - "rewards/tag_count_reward/mean": 0.94677734375, - "rewards/tag_count_reward/std": 0.16321250796318054, + "grad_norm": 2.328171730041504, + "kl": 2.21875, + "learning_rate": 1.6318030847953896e-07, + "loss": 0.1184, + "num_tokens": 1411708412.0, + "reward": 1.11083984375, + "reward_std": 0.3626022934913635, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.93310546875, + "rewards/tag_count_reward/std": 0.17637556791305542, "step": 2480 }, { @@ -71935,27 +71935,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 814.62890625, - "completions/mean_terminated_length": 777.4044189453125, - "completions/min_length": 42.0, - "completions/min_terminated_length": 42.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1080.228515625, + "completions/mean_terminated_length": 1026.3526611328125, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, "epoch": 0.846974481522574, - "grad_norm": 0.9385043382644653, - "kl": 4.55078125, - "learning_rate": 1.6285998621003581e-07, - "loss": 0.2869, - "num_tokens": 1330174619.0, - "reward": 1.89208984375, - "reward_std": 0.4626353085041046, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.89453125, - "rewards/format_reward/std": 0.3074568510055542, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.18718823790550232, + "grad_norm": 3.991018295288086, + "kl": 2.23046875, + "learning_rate": 1.6290654495084523e-07, + "loss": 0.0846, + "num_tokens": 1412339057.0, + "reward": 1.1220703125, + "reward_std": 0.3936161994934082, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.09765625, + "rewards/format_reward/std": 0.29713961482048035, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.19620363414287567, "step": 2481 }, { @@ -71964,27 +71964,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1863.0, - "completions/mean_length": 823.2890625, - "completions/mean_terminated_length": 776.0892333984375, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1085.794921875, + "completions/mean_terminated_length": 1046.680908203125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, "epoch": 0.8473158658359649, - "grad_norm": 2.317014455795288, - "kl": 6.4921875, - "learning_rate": 1.6258696963163704e-07, - "loss": 0.4277, - "num_tokens": 1330675295.0, - "reward": 1.8251953125, - "reward_std": 0.52950119972229, - "rewards/accuracy_reward/mean": 0.02734375, - "rewards/accuracy_reward/std": 0.16324250400066376, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.19746580719947815, + "grad_norm": 5.698953151702881, + "kl": 3.13671875, + "learning_rate": 1.6263333124743972e-07, + "loss": 0.1505, + "num_tokens": 1412974136.0, + "reward": 1.02490234375, + "reward_std": 0.3475743532180786, + "rewards/accuracy_reward/mean": 0.0234375, + "rewards/accuracy_reward/std": 0.15143637359142303, + "rewards/format_reward/mean": 0.083984375, + "rewards/format_reward/std": 0.2776356339454651, + "rewards/tag_count_reward/mean": 0.91748046875, + "rewards/tag_count_reward/std": 0.19635149836540222, "step": 2482 }, { @@ -71993,27 +71993,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1962.0, - "completions/mean_length": 872.4296875, - "completions/mean_terminated_length": 856.1347045898438, - "completions/min_length": 173.0, - "completions/min_terminated_length": 173.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1150.109375, + "completions/mean_terminated_length": 1098.165283203125, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, "epoch": 0.8476572501493557, - "grad_norm": 1.13470458984375, - "kl": 4.11328125, - "learning_rate": 1.6231450291517617e-07, - "loss": 0.2317, - "num_tokens": 1331204923.0, - "reward": 1.9345703125, - "reward_std": 0.4377116560935974, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.900390625, - "rewards/format_reward/std": 0.29977133870124817, - "rewards/tag_count_reward/mean": 0.9521484375, - "rewards/tag_count_reward/std": 0.1576654314994812, + "grad_norm": 2.203179359436035, + "kl": 2.31640625, + "learning_rate": 1.6236066775739343e-07, + "loss": 0.1291, + "num_tokens": 1413645936.0, + "reward": 1.14453125, + "reward_std": 0.38081881403923035, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.19170314073562622, "step": 2483 }, { @@ -72022,27 +72022,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1879.0, - "completions/mean_length": 823.8828125, - "completions/mean_terminated_length": 776.7058715820312, - "completions/min_length": 88.0, - "completions/min_terminated_length": 88.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1117.328125, + "completions/mean_terminated_length": 1053.2109375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, "epoch": 0.8479986344627465, - "grad_norm": 1.4106740951538086, - "kl": 6.3828125, - "learning_rate": 1.6204258644736966e-07, - "loss": 0.3612, - "num_tokens": 1331706143.0, - "reward": 1.89404296875, - "reward_std": 0.5347704887390137, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.19288021326065063, + "grad_norm": 1.9850155115127563, + "kl": 2.4765625, + "learning_rate": 1.6208855486799602e-07, + "loss": 0.1277, + "num_tokens": 1414297400.0, + "reward": 1.09912109375, + "reward_std": 0.4083441197872162, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.89794921875, + "rewards/tag_count_reward/std": 0.21492886543273926, "step": 2484 }, { @@ -72051,27 +72051,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, - "completions/mean_length": 794.443359375, - "completions/mean_terminated_length": 772.0138549804688, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/mean_length": 1087.8359375, + "completions/mean_terminated_length": 1038.5462646484375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, "epoch": 0.8483400187761372, - "grad_norm": 1.2902165651321411, - "kl": 4.42578125, - "learning_rate": 1.6177122061415337e-07, - "loss": 0.2628, - "num_tokens": 1332190850.0, - "reward": 1.943359375, - "reward_std": 0.4758620262145996, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.951171875, - "rewards/tag_count_reward/std": 0.15342977643013, + "grad_norm": 2.6356756687164307, + "kl": 2.484375, + "learning_rate": 1.6181699296575515e-07, + "loss": 0.1055, + "num_tokens": 1414932324.0, + "reward": 1.1005859375, + "reward_std": 0.43871521949768066, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.9091796875, + "rewards/tag_count_reward/std": 0.20405313372612, "step": 2485 }, { @@ -72080,27 +72080,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 803.541015625, - "completions/mean_terminated_length": 778.7510375976562, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1041.1953125, + "completions/mean_terminated_length": 998.1344604492188, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, "epoch": 0.848681403089528, - "grad_norm": 1.1515048742294312, - "kl": 4.984375, - "learning_rate": 1.615004058006812e-07, - "loss": 0.2792, - "num_tokens": 1332673479.0, - "reward": 1.91943359375, - "reward_std": 0.5097028017044067, - "rewards/accuracy_reward/mean": 0.09677419066429138, - "rewards/accuracy_reward/std": 0.2959485352039337, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.94287109375, - "rewards/tag_count_reward/std": 0.1749558448791504, + "grad_norm": 1.6693556308746338, + "kl": 2.73828125, + "learning_rate": 1.6154598243639582e-07, + "loss": 0.1844, + "num_tokens": 1415536632.0, + "reward": 1.08447265625, + "reward_std": 0.3561251759529114, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310528099536896, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.92236328125, + "rewards/tag_count_reward/std": 0.19270674884319305, "step": 2486 }, { @@ -72109,27 +72109,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1709.0, - "completions/mean_length": 782.19140625, - "completions/mean_terminated_length": 749.2144165039062, - "completions/min_length": 104.0, - "completions/min_terminated_length": 104.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1096.921875, + "completions/mean_terminated_length": 1029.27197265625, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, "epoch": 0.8490227874029188, - "grad_norm": 1.8417575359344482, - "kl": 6.6484375, - "learning_rate": 1.6123014239132568e-07, - "loss": 0.3942, - "num_tokens": 1333159577.0, - "reward": 1.93701171875, - "reward_std": 0.5437750816345215, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.17433211207389832, + "grad_norm": 7.149250030517578, + "kl": 2.5625, + "learning_rate": 1.6127552366485957e-07, + "loss": 0.1022, + "num_tokens": 1416183872.0, + "reward": 1.15478515625, + "reward_std": 0.4297928810119629, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.095703125, + "rewards/format_reward/std": 0.2944713830947876, + "rewards/tag_count_reward/mean": 0.90869140625, + "rewards/tag_count_reward/std": 0.2106221467256546, "step": 2487 }, { @@ -72138,27 +72138,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1965.0, - "completions/mean_length": 876.380859375, - "completions/mean_terminated_length": 843.4437255859375, - "completions/min_length": 101.0, - "completions/min_terminated_length": 101.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1105.525390625, + "completions/mean_terminated_length": 1055.1048583984375, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, "epoch": 0.8493641717163096, - "grad_norm": 2.1319403648376465, - "kl": 6.3046875, - "learning_rate": 1.6096043076967592e-07, - "loss": 0.3498, - "num_tokens": 1333679404.0, - "reward": 1.87890625, - "reward_std": 0.5440197587013245, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.1876426339149475, + "grad_norm": 2.0782597064971924, + "kl": 2.49609375, + "learning_rate": 1.6100561703530475e-07, + "loss": 0.1267, + "num_tokens": 1416821021.0, + "reward": 1.0869140625, + "reward_std": 0.37439000606536865, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.19242700934410095, "step": 2488 }, { @@ -72167,27 +72167,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.083984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1989.0, - "completions/mean_length": 786.470703125, - "completions/mean_terminated_length": 761.3406372070312, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1086.412109375, + "completions/mean_terminated_length": 998.24951171875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, "epoch": 0.8497055560297004, - "grad_norm": 1.8529338836669922, - "kl": 6.3359375, - "learning_rate": 1.6069127131853846e-07, - "loss": 0.3665, - "num_tokens": 1334159837.0, - "reward": 1.91650390625, - "reward_std": 0.5342533588409424, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.18481481075286865, + "grad_norm": 2.828303813934326, + "kl": 2.9375, + "learning_rate": 1.6073626293110485e-07, + "loss": 0.192, + "num_tokens": 1417455024.0, + "reward": 1.1123046875, + "reward_std": 0.39791157841682434, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.8896484375, + "rewards/tag_count_reward/std": 0.22671453654766083, "step": 2489 }, { @@ -72196,27 +72196,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1988.0, - "completions/mean_length": 891.326171875, - "completions/mean_terminated_length": 858.8092041015625, - "completions/min_length": 191.0, - "completions/min_terminated_length": 191.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1160.169921875, + "completions/mean_terminated_length": 1118.4110107421875, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.8500469403430913, - "grad_norm": 1.5846630334854126, - "kl": 6.0703125, - "learning_rate": 1.6042266441993583e-07, - "loss": 0.3741, - "num_tokens": 1334687492.0, - "reward": 1.87841796875, - "reward_std": 0.49128904938697815, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.17150279879570007, + "grad_norm": 2.182011127471924, + "kl": 1.7607421875, + "learning_rate": 1.6046746173484905e-07, + "loss": 0.069, + "num_tokens": 1418120327.0, + "reward": 1.1328125, + "reward_std": 0.34036684036254883, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.935546875, + "rewards/tag_count_reward/std": 0.17060412466526031, "step": 2490 }, { @@ -72225,27 +72225,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 762.4375, - "completions/mean_terminated_length": 731.5840454101562, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1039.5859375, + "completions/mean_terminated_length": 951.8047485351562, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, "epoch": 0.8503883246564821, - "grad_norm": 1.1558104753494263, - "kl": 3.9140625, - "learning_rate": 1.6015461045510626e-07, - "loss": 0.2275, - "num_tokens": 1335158852.0, - "reward": 1.95751953125, - "reward_std": 0.5034546852111816, - "rewards/accuracy_reward/mean": 0.138671875, - "rewards/accuracy_reward/std": 0.34594178199768066, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.94580078125, - "rewards/tag_count_reward/std": 0.15832088887691498, + "grad_norm": 1.5009369850158691, + "kl": 2.94140625, + "learning_rate": 1.6019921382834104e-07, + "loss": 0.166, + "num_tokens": 1418733587.0, + "reward": 1.16943359375, + "reward_std": 0.3654649555683136, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.39980348944664, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.90576171875, + "rewards/tag_count_reward/std": 0.20815329253673553, "step": 2491 }, { @@ -72254,27 +72254,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 849.712890625, - "completions/mean_terminated_length": 816.0260620117188, - "completions/min_length": 197.0, - "completions/min_terminated_length": 197.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1104.71484375, + "completions/mean_terminated_length": 1031.23779296875, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, "epoch": 0.8507297089698729, - "grad_norm": 1.2182904481887817, - "kl": 5.984375, - "learning_rate": 1.598871098045036e-07, - "loss": 0.3652, - "num_tokens": 1335678353.0, - "reward": 1.88720703125, - "reward_std": 0.6029765009880066, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.19859671592712402, + "grad_norm": 5.200196743011475, + "kl": 2.265625, + "learning_rate": 1.5993151959259855e-07, + "loss": 0.152, + "num_tokens": 1419383649.0, + "reward": 1.095703125, + "reward_std": 0.38947418332099915, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.20036010444164276, "step": 2492 }, { @@ -72283,27 +72283,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1916.0, - "completions/mean_length": 805.84765625, - "completions/mean_terminated_length": 760.5870361328125, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1056.689453125, + "completions/mean_terminated_length": 1003.6563720703125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, "epoch": 0.8510710932832636, - "grad_norm": 1.2753212451934814, - "kl": 6.203125, - "learning_rate": 1.5962016284779578e-07, - "loss": 0.3772, - "num_tokens": 1336165315.0, - "reward": 1.7998046875, - "reward_std": 0.4820001721382141, - "rewards/accuracy_reward/mean": 0.013671875, - "rewards/accuracy_reward/std": 0.1162383034825325, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.20712071657180786, + "grad_norm": 2.5590600967407227, + "kl": 2.568359375, + "learning_rate": 1.5966437940785281e-07, + "loss": 0.1067, + "num_tokens": 1419999042.0, + "reward": 1.013671875, + "reward_std": 0.3545224070549011, + "rewards/accuracy_reward/mean": 0.033203125, + "rewards/accuracy_reward/std": 0.17934183776378632, + "rewards/format_reward/mean": 0.07421875, + "rewards/format_reward/std": 0.2623828947544098, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.20866736769676208, "step": 2493 }, { @@ -72312,27 +72312,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1977.0, - "completions/mean_length": 807.736328125, - "completions/mean_terminated_length": 783.0299072265625, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1103.6953125, + "completions/mean_terminated_length": 1021.4947509765625, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, "epoch": 0.8514124775966544, - "grad_norm": 0.8199345469474792, - "kl": 5.203125, - "learning_rate": 1.5935376996386552e-07, - "loss": 0.3086, - "num_tokens": 1336655564.0, - "reward": 1.90576171875, - "reward_std": 0.5557575225830078, - "rewards/accuracy_reward/mean": 0.1270161271095276, - "rewards/accuracy_reward/std": 0.33332720398902893, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.19323164224624634, + "grad_norm": 2.4877443313598633, + "kl": 2.953125, + "learning_rate": 1.5939779365354836e-07, + "loss": 0.1993, + "num_tokens": 1420640822.0, + "reward": 1.12744140625, + "reward_std": 0.40630829334259033, + "rewards/accuracy_reward/mean": 0.16935484111309052, + "rewards/accuracy_reward/std": 0.3754436671733856, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.89501953125, + "rewards/tag_count_reward/std": 0.22467544674873352, "step": 2494 }, { @@ -72341,27 +72341,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 800.568359375, - "completions/mean_terminated_length": 736.5318603515625, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 995.775390625, + "completions/mean_terminated_length": 950.7719116210938, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, "epoch": 0.8517538619100452, - "grad_norm": 1.5452642440795898, - "kl": 7.2421875, - "learning_rate": 1.590879315308086e-07, - "loss": 0.4371, - "num_tokens": 1337137519.0, - "reward": 1.8212890625, - "reward_std": 0.6151126623153687, - "rewards/accuracy_reward/mean": 0.0927419364452362, - "rewards/accuracy_reward/std": 0.2903633117675781, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.9072265625, - "rewards/tag_count_reward/std": 0.226512148976326, + "grad_norm": 4.02091646194458, + "kl": 1.970703125, + "learning_rate": 1.591317627083419e-07, + "loss": 0.1157, + "num_tokens": 1421222723.0, + "reward": 1.12060546875, + "reward_std": 0.36581292748451233, + "rewards/accuracy_reward/mean": 0.15120968222618103, + "rewards/accuracy_reward/std": 0.35861483216285706, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.91943359375, + "rewards/tag_count_reward/std": 0.1965412050485611, "step": 2495 }, { @@ -72370,27 +72370,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1915.0, - "completions/mean_length": 835.091796875, - "completions/mean_terminated_length": 785.7865600585938, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1100.44921875, + "completions/mean_terminated_length": 1011.36328125, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, "epoch": 0.852095246223436, - "grad_norm": 1.4080651998519897, - "kl": 5.32421875, - "learning_rate": 1.5882264792593397e-07, - "loss": 0.3321, - "num_tokens": 1337647918.0, - "reward": 1.91796875, - "reward_std": 0.4931030571460724, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.17485272884368896, + "grad_norm": 2.572394609451294, + "kl": 2.708984375, + "learning_rate": 1.5886628695010224e-07, + "loss": 0.1293, + "num_tokens": 1421868985.0, + "reward": 1.09521484375, + "reward_std": 0.38914650678634644, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.89599609375, + "rewards/tag_count_reward/std": 0.22184641659259796, "step": 2496 }, { @@ -72399,27 +72399,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1990.0, - "completions/mean_length": 870.51171875, - "completions/mean_terminated_length": 830.07275390625, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1073.783203125, + "completions/mean_terminated_length": 1013.1473388671875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, "epoch": 0.8524366305368268, - "grad_norm": 2.8357620239257812, - "kl": 5.578125, - "learning_rate": 1.5855791952576342e-07, - "loss": 0.3538, - "num_tokens": 1338170036.0, - "reward": 1.837890625, - "reward_std": 0.5277907252311707, - "rewards/accuracy_reward/mean": 0.060483869165182114, - "rewards/accuracy_reward/std": 0.2386218160390854, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.19549374282360077, + "grad_norm": 4.3175578117370605, + "kl": 2.671875, + "learning_rate": 1.586013667559096e-07, + "loss": 0.1764, + "num_tokens": 1422495178.0, + "reward": 1.04638671875, + "reward_std": 0.3526151180267334, + "rewards/accuracy_reward/mean": 0.06854838877916336, + "rewards/accuracy_reward/std": 0.25293970108032227, + "rewards/format_reward/mean": 0.072265625, + "rewards/format_reward/std": 0.2591804563999176, + "rewards/tag_count_reward/mean": 0.90771484375, + "rewards/tag_count_reward/std": 0.21135582029819489, "step": 2497 }, { @@ -72428,27 +72428,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1882.0, - "completions/mean_length": 854.41015625, - "completions/mean_terminated_length": 800.8203735351562, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1123.138671875, + "completions/mean_terminated_length": 1040.491455078125, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, "epoch": 0.8527780148502176, - "grad_norm": 1.2506555318832397, - "kl": 6.71875, - "learning_rate": 1.582937467060302e-07, - "loss": 0.4093, - "num_tokens": 1338696598.0, - "reward": 1.77880859375, - "reward_std": 0.5462380647659302, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.20386749505996704, + "grad_norm": 3.066497325897217, + "kl": 2.97265625, + "learning_rate": 1.5833700250205528e-07, + "loss": 0.1977, + "num_tokens": 1423159329.0, + "reward": 1.01953125, + "reward_std": 0.3580136299133301, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.892578125, + "rewards/tag_count_reward/std": 0.2265058159828186, "step": 2498 }, { @@ -72457,27 +72457,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.064453125, + "completions/clipped_ratio": 0.091796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 906.654296875, - "completions/mean_terminated_length": 828.0230102539062, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1142.677734375, + "completions/mean_terminated_length": 1051.172119140625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.8531193991636085, - "grad_norm": 2.58473539352417, - "kl": 7.6328125, - "learning_rate": 1.5803012984167963e-07, - "loss": 0.4522, - "num_tokens": 1339242885.0, - "reward": 1.80859375, - "reward_std": 0.5806085467338562, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.904296875, - "rewards/tag_count_reward/std": 0.2181157022714615, + "grad_norm": 1.9884110689163208, + "kl": 2.751953125, + "learning_rate": 1.5807319456404054e-07, + "loss": 0.1674, + "num_tokens": 1423826460.0, + "reward": 1.0625, + "reward_std": 0.40620261430740356, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.88671875, + "rewards/tag_count_reward/std": 0.22742362320423126, "step": 2499 }, { @@ -72488,25 +72488,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1982.0, - "completions/mean_length": 931.302734375, - "completions/mean_terminated_length": 864.2546997070312, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1155.431640625, + "completions/mean_terminated_length": 1101.840576171875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.8534607834769993, - "grad_norm": 1.5278397798538208, - "kl": 7.0703125, - "learning_rate": 1.5776706930686738e-07, - "loss": 0.4381, - "num_tokens": 1339805136.0, - "reward": 1.8017578125, - "reward_std": 0.5659550428390503, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.83203125, - "rewards/format_reward/std": 0.374204158782959, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.21028900146484375, + "grad_norm": 2.617475986480713, + "kl": 2.330078125, + "learning_rate": 1.5780994331657667e-07, + "loss": 0.1171, + "num_tokens": 1424503465.0, + "reward": 1.09716796875, + "reward_std": 0.4116724729537964, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.90380859375, + "rewards/tag_count_reward/std": 0.2119247019290924, "step": 2500 }, { @@ -72515,27 +72515,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1789.0, - "completions/mean_length": 842.298828125, - "completions/mean_terminated_length": 805.909423828125, - "completions/min_length": 99.0, - "completions/min_terminated_length": 99.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1133.24609375, + "completions/mean_terminated_length": 1053.617919921875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, "epoch": 0.85380216779039, - "grad_norm": 2.8222107887268066, - "kl": 5.94140625, - "learning_rate": 1.5750456547495995e-07, - "loss": 0.4095, - "num_tokens": 1340313097.0, - "reward": 1.8447265625, - "reward_std": 0.4956468939781189, - "rewards/accuracy_reward/mean": 0.032258063554763794, - "rewards/accuracy_reward/std": 0.17686307430267334, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.18339595198631287, + "grad_norm": 5.500685214996338, + "kl": 2.76171875, + "learning_rate": 1.5754724913358417e-07, + "loss": 0.1886, + "num_tokens": 1425160391.0, + "reward": 1.0791015625, + "reward_std": 0.4148721694946289, + "rewards/accuracy_reward/mean": 0.09879032522439957, + "rewards/accuracy_reward/std": 0.2986815273761749, + "rewards/format_reward/mean": 0.076171875, + "rewards/format_reward/std": 0.26553234457969666, + "rewards/tag_count_reward/mean": 0.9072265625, + "rewards/tag_count_reward/std": 0.2143038660287857, "step": 2501 }, { @@ -72544,27 +72544,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1976.0, - "completions/mean_length": 848.896484375, - "completions/mean_terminated_length": 787.3408813476562, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1101.98828125, + "completions/mean_terminated_length": 1026.1475830078125, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, "epoch": 0.8541435521037808, - "grad_norm": 1.4963473081588745, - "kl": 6.296875, - "learning_rate": 1.572426187185334e-07, - "loss": 0.3906, - "num_tokens": 1340827364.0, - "reward": 1.82080078125, - "reward_std": 0.5283799171447754, - "rewards/accuracy_reward/mean": 0.0463709682226181, - "rewards/accuracy_reward/std": 0.21049949526786804, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.1984763890504837, + "grad_norm": 1.9453545808792114, + "kl": 2.6953125, + "learning_rate": 1.5728511238819235e-07, + "loss": 0.1576, + "num_tokens": 1425804241.0, + "reward": 1.07666015625, + "reward_std": 0.3485608696937561, + "rewards/accuracy_reward/mean": 0.10080645233392715, + "rewards/accuracy_reward/std": 0.30137622356414795, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.91455078125, + "rewards/tag_count_reward/std": 0.19383399188518524, "step": 2502 }, { @@ -72573,27 +72573,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1818.0, - "completions/mean_length": 784.9765625, - "completions/mean_terminated_length": 738.9555053710938, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1058.607421875, + "completions/mean_terminated_length": 979.2890014648438, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, "epoch": 0.8544849364171716, - "grad_norm": 2.6061666011810303, - "kl": 5.6484375, - "learning_rate": 1.5698122940937325e-07, - "loss": 0.3959, - "num_tokens": 1341301432.0, - "reward": 1.8818359375, - "reward_std": 0.49414151906967163, - "rewards/accuracy_reward/mean": 0.06854838877916336, - "rewards/accuracy_reward/std": 0.25293970108032227, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.18604448437690735, + "grad_norm": 8.42778205871582, + "kl": 3.2578125, + "learning_rate": 1.5702353345273876e-07, + "loss": 0.2364, + "num_tokens": 1426418408.0, + "reward": 1.10302734375, + "reward_std": 0.4140213429927826, + "rewards/accuracy_reward/mean": 0.1391129046678543, + "rewards/accuracy_reward/std": 0.34641367197036743, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.90380859375, + "rewards/tag_count_reward/std": 0.2084331214427948, "step": 2503 }, { @@ -72602,27 +72602,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1990.0, - "completions/mean_length": 821.587890625, - "completions/mean_terminated_length": 789.6372680664062, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1060.236328125, + "completions/mean_terminated_length": 1022.1683349609375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.8548263207305624, - "grad_norm": 1.573183298110962, - "kl": 5.01953125, - "learning_rate": 1.5672039791847385e-07, - "loss": 0.2964, - "num_tokens": 1341791429.0, - "reward": 1.83984375, - "reward_std": 0.4401506185531616, - "rewards/accuracy_reward/mean": 0.015625, - "rewards/accuracy_reward/std": 0.12414088100194931, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.94140625, - "rewards/tag_count_reward/std": 0.17199864983558655, + "grad_norm": 6.326718330383301, + "kl": 1.9765625, + "learning_rate": 1.567625126987686e-07, + "loss": 0.0499, + "num_tokens": 1427030593.0, + "reward": 1.06591796875, + "reward_std": 0.36252841353416443, + "rewards/accuracy_reward/mean": 0.03515625, + "rewards/accuracy_reward/std": 0.1843547374010086, + "rewards/format_reward/mean": 0.095703125, + "rewards/format_reward/std": 0.2944713830947876, + "rewards/tag_count_reward/mean": 0.93505859375, + "rewards/tag_count_reward/std": 0.16494794189929962, "step": 2504 }, { @@ -72631,27 +72631,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1984.0, - "completions/mean_length": 841.2109375, - "completions/mean_terminated_length": 797.2388916015625, - "completions/min_length": 167.0, - "completions/min_terminated_length": 167.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1082.912109375, + "completions/mean_terminated_length": 1031.2818603515625, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, "epoch": 0.8551677050439532, - "grad_norm": 2.2540669441223145, - "kl": 6.12890625, - "learning_rate": 1.5646012461603773e-07, - "loss": 0.4215, - "num_tokens": 1342291985.0, - "reward": 1.87255859375, - "reward_std": 0.5276371240615845, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.18583056330680847, + "grad_norm": 3.129791736602783, + "kl": 1.93359375, + "learning_rate": 1.5650205049703417e-07, + "loss": 0.1046, + "num_tokens": 1427654900.0, + "reward": 1.1533203125, + "reward_std": 0.37939101457595825, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.16662302613258362, "step": 2505 }, { @@ -72660,27 +72660,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 825.771484375, - "completions/mean_terminated_length": 798.9360961914062, - "completions/min_length": 78.0, - "completions/min_terminated_length": 78.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1122.0703125, + "completions/mean_terminated_length": 1070.5238037109375, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, "epoch": 0.855509089357344, - "grad_norm": 1.54689621925354, - "kl": 4.474609375, - "learning_rate": 1.5620040987147536e-07, - "loss": 0.279, - "num_tokens": 1342792396.0, - "reward": 1.91552734375, - "reward_std": 0.4751802086830139, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.900390625, - "rewards/format_reward/std": 0.29977133870124817, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.1825571358203888, + "grad_norm": 4.0424723625183105, + "kl": 2.6953125, + "learning_rate": 1.5624214721749454e-07, + "loss": 0.1543, + "num_tokens": 1428307016.0, + "reward": 1.09765625, + "reward_std": 0.4202662706375122, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.0859375, + "rewards/format_reward/std": 0.28054583072662354, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.19499453902244568, "step": 2506 }, { @@ -72689,27 +72689,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 796.3828125, - "completions/mean_terminated_length": 776.5159301757812, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1092.7265625, + "completions/mean_terminated_length": 1047.7955322265625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, "epoch": 0.8558504736707349, - "grad_norm": 1.6224844455718994, - "kl": 4.1015625, - "learning_rate": 1.559412540534043e-07, - "loss": 0.2462, - "num_tokens": 1343279280.0, - "reward": 1.900390625, - "reward_std": 0.4144550859928131, - "rewards/accuracy_reward/mean": 0.03427419438958168, - "rewards/accuracy_reward/std": 0.18211629986763, - "rewards/format_reward/mean": 0.91015625, - "rewards/format_reward/std": 0.2862374484539032, - "rewards/tag_count_reward/mean": 0.95703125, - "rewards/tag_count_reward/std": 0.15438589453697205, + "grad_norm": 5.255420207977295, + "kl": 2.375, + "learning_rate": 1.559828032293147e-07, + "loss": 0.1522, + "num_tokens": 1428945628.0, + "reward": 1.072265625, + "reward_std": 0.3573336899280548, + "rewards/accuracy_reward/mean": 0.060483869165182114, + "rewards/accuracy_reward/std": 0.23862183094024658, + "rewards/format_reward/mean": 0.080078125, + "rewards/format_reward/std": 0.271679550409317, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.17621363699436188, "step": 2507 }, { @@ -72718,27 +72718,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 780.849609375, - "completions/mean_terminated_length": 753.0279541015625, - "completions/min_length": 81.0, - "completions/min_terminated_length": 81.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1042.43359375, + "completions/mean_terminated_length": 977.6257934570312, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.8561918579841257, - "grad_norm": 0.9053124785423279, - "kl": 4.8203125, - "learning_rate": 1.5568265752964865e-07, - "loss": 0.2992, - "num_tokens": 1343756947.0, - "reward": 1.9140625, - "reward_std": 0.4191058874130249, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.912109375, - "rewards/format_reward/std": 0.2834126651287079, - "rewards/tag_count_reward/mean": 0.951171875, - "rewards/tag_count_reward/std": 0.17078326642513275, + "grad_norm": 3.9764435291290283, + "kl": 2.91015625, + "learning_rate": 1.5572401890086537e-07, + "loss": 0.1586, + "num_tokens": 1429557226.0, + "reward": 1.0341796875, + "reward_std": 0.3454781174659729, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.9091796875, + "rewards/tag_count_reward/std": 0.19920024275779724, "step": 2508 }, { @@ -72747,27 +72747,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1950.0, - "completions/mean_length": 796.09375, - "completions/mean_terminated_length": 778.7406005859375, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1097.9609375, + "completions/mean_terminated_length": 1040.9193115234375, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, "epoch": 0.8565332422975165, - "grad_norm": 4.397088527679443, - "kl": 4.3828125, - "learning_rate": 1.5542462066723912e-07, - "loss": 0.2746, - "num_tokens": 1344241939.0, - "reward": 1.9208984375, - "reward_std": 0.38755735754966736, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.91796875, - "rewards/format_reward/std": 0.2746807038784027, - "rewards/tag_count_reward/mean": 0.9560546875, - "rewards/tag_count_reward/std": 0.1517105996608734, + "grad_norm": 8.94039249420166, + "kl": 2.96484375, + "learning_rate": 1.5546579459972237e-07, + "loss": 0.1281, + "num_tokens": 1430196774.0, + "reward": 1.10400390625, + "reward_std": 0.4110373258590698, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.115234375, + "rewards/format_reward/std": 0.3196168541908264, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.18835864961147308, "step": 2509 }, { @@ -72776,27 +72776,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1988.0, - "completions/mean_length": 865.845703125, - "completions/mean_terminated_length": 830.1669921875, - "completions/min_length": 64.0, - "completions/min_terminated_length": 64.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1099.615234375, + "completions/mean_terminated_length": 1046.818603515625, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, "epoch": 0.8568746266109072, - "grad_norm": 0.8987749814987183, - "kl": 5.4609375, - "learning_rate": 1.551671438324116e-07, - "loss": 0.3491, - "num_tokens": 1344768708.0, - "reward": 1.9482421875, - "reward_std": 0.46120744943618774, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.9462890625, - "rewards/tag_count_reward/std": 0.17288248240947723, + "grad_norm": 2.3975346088409424, + "kl": 2.46484375, + "learning_rate": 1.5520813069266605e-07, + "loss": 0.1358, + "num_tokens": 1430843233.0, + "reward": 1.126953125, + "reward_std": 0.3676784336566925, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.919921875, + "rewards/tag_count_reward/std": 0.1835547834634781, "step": 2510 }, { @@ -72805,27 +72805,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1776.0, - "completions/mean_length": 824.564453125, - "completions/mean_terminated_length": 797.7025756835938, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1076.9296875, + "completions/mean_terminated_length": 1027.080078125, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, "epoch": 0.857216010924298, - "grad_norm": 1.2078722715377808, - "kl": 5.28125, - "learning_rate": 1.549102273906076e-07, - "loss": 0.3368, - "num_tokens": 1345279525.0, - "reward": 1.91015625, - "reward_std": 0.43426308035850525, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.908203125, - "rewards/format_reward/std": 0.289021372795105, - "rewards/tag_count_reward/mean": 0.951171875, - "rewards/tag_count_reward/std": 0.16271483898162842, + "grad_norm": 3.456537961959839, + "kl": 3.11328125, + "learning_rate": 1.549510275456805e-07, + "loss": 0.1786, + "num_tokens": 1431483261.0, + "reward": 1.0869140625, + "reward_std": 0.41201263666152954, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.09765625, + "rewards/format_reward/std": 0.29713961482048035, + "rewards/tag_count_reward/mean": 0.9189453125, + "rewards/tag_count_reward/std": 0.1972721517086029, "step": 2511 }, { @@ -72834,27 +72834,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 835.6640625, - "completions/mean_terminated_length": 791.4898681640625, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1080.962890625, + "completions/mean_terminated_length": 1012.1777954101562, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, "epoch": 0.8575573952376888, - "grad_norm": 2.947654962539673, - "kl": 7.4765625, - "learning_rate": 1.5465387170647284e-07, - "loss": 0.4508, - "num_tokens": 1345780601.0, - "reward": 1.88427734375, - "reward_std": 0.47508326172828674, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.93505859375, - "rewards/tag_count_reward/std": 0.19169752299785614, + "grad_norm": 2.3917860984802246, + "kl": 3.98828125, + "learning_rate": 1.5469448552395384e-07, + "loss": 0.233, + "num_tokens": 1432109930.0, + "reward": 1.0576171875, + "reward_std": 0.40050607919692993, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.07421875, + "rewards/format_reward/std": 0.2623828947544098, + "rewards/tag_count_reward/mean": 0.9091796875, + "rewards/tag_count_reward/std": 0.19981329143047333, "step": 2512 }, { @@ -72863,27 +72863,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1844.0, - "completions/mean_length": 807.88671875, - "completions/mean_terminated_length": 773.0240478515625, - "completions/min_length": 62.0, - "completions/min_terminated_length": 62.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1098.43359375, + "completions/mean_terminated_length": 1045.5711669921875, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, "epoch": 0.8578987795510796, - "grad_norm": 1.9713817834854126, - "kl": 5.75390625, - "learning_rate": 1.5439807714385747e-07, - "loss": 0.3325, - "num_tokens": 1346274911.0, - "reward": 1.93212890625, - "reward_std": 0.5016872882843018, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.892578125, - "rewards/format_reward/std": 0.30995169281959534, - "rewards/tag_count_reward/mean": 0.94970703125, - "rewards/tag_count_reward/std": 0.16037173569202423, + "grad_norm": 13.534027099609375, + "kl": 3.3984375, + "learning_rate": 1.5443850499187656e-07, + "loss": 0.1254, + "num_tokens": 1432753000.0, + "reward": 1.18310546875, + "reward_std": 0.4591599106788635, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, + "rewards/format_reward/mean": 0.09765625, + "rewards/format_reward/std": 0.29713961482048035, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.1891029179096222, "step": 2513 }, { @@ -72892,27 +72892,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 832.283203125, - "completions/mean_terminated_length": 805.5908203125, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1149.34375, + "completions/mean_terminated_length": 1101.2674560546875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, "epoch": 0.8582401638644704, - "grad_norm": 1.8401319980621338, - "kl": 6.2265625, - "learning_rate": 1.54142844065815e-07, - "loss": 0.3435, - "num_tokens": 1346782384.0, - "reward": 1.9248046875, - "reward_std": 0.42352402210235596, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.90625, - "rewards/format_reward/std": 0.29176566004753113, - "rewards/tag_count_reward/mean": 0.9560546875, - "rewards/tag_count_reward/std": 0.14679372310638428, + "grad_norm": 5.248220443725586, + "kl": 3.78515625, + "learning_rate": 1.54183086313042e-07, + "loss": 0.195, + "num_tokens": 1433422808.0, + "reward": 1.1142578125, + "reward_std": 0.4387296140193939, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.1015625, + "rewards/format_reward/std": 0.30236753821372986, + "rewards/tag_count_reward/mean": 0.9091796875, + "rewards/tag_count_reward/std": 0.2058434784412384, "step": 2514 }, { @@ -72921,27 +72921,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1917.0, - "completions/mean_length": 768.3671875, - "completions/mean_terminated_length": 745.4711303710938, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1072.4453125, + "completions/mean_terminated_length": 1011.7261962890625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, "epoch": 0.8585815481778613, - "grad_norm": 1.9592394828796387, - "kl": 6.00390625, - "learning_rate": 1.5388817283460205e-07, - "loss": 0.359, - "num_tokens": 1347251980.0, - "reward": 1.92138671875, - "reward_std": 0.42750340700149536, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.90234375, - "rewards/format_reward/std": 0.29713961482048035, - "rewards/tag_count_reward/mean": 0.94677734375, - "rewards/tag_count_reward/std": 0.1719701737165451, + "grad_norm": 6.2392578125, + "kl": 3.21875, + "learning_rate": 1.539282298502454e-07, + "loss": 0.1547, + "num_tokens": 1434048092.0, + "reward": 1.08447265625, + "reward_std": 0.3809046745300293, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.0859375, + "rewards/format_reward/std": 0.28054583072662354, + "rewards/tag_count_reward/mean": 0.91259765625, + "rewards/tag_count_reward/std": 0.20702555775642395, "step": 2515 }, { @@ -72950,27 +72950,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1955.0, - "completions/mean_length": 821.07421875, - "completions/mean_terminated_length": 794.1357421875, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1115.447265625, + "completions/mean_terminated_length": 1049.114990234375, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, "epoch": 0.8589229324912521, - "grad_norm": 2.8998708724975586, - "kl": 5.7265625, - "learning_rate": 1.5363406381167798e-07, - "loss": 0.3285, - "num_tokens": 1347750978.0, - "reward": 1.88525390625, - "reward_std": 0.3968276083469391, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.94775390625, - "rewards/tag_count_reward/std": 0.17155851423740387, + "grad_norm": 4.1004252433776855, + "kl": 3.26953125, + "learning_rate": 1.5367393596548355e-07, + "loss": 0.1713, + "num_tokens": 1434697809.0, + "reward": 1.033203125, + "reward_std": 0.4006016254425049, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.072265625, + "rewards/format_reward/std": 0.2591804563999176, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.2214287668466568, "step": 2516 }, { @@ -72979,27 +72979,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1952.0, - "completions/mean_length": 799.5859375, - "completions/mean_terminated_length": 754.09716796875, - "completions/min_length": 180.0, - "completions/min_terminated_length": 180.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1120.7109375, + "completions/mean_terminated_length": 1046.3712158203125, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, "epoch": 0.8592643168046429, - "grad_norm": 0.846878170967102, - "kl": 6.20703125, - "learning_rate": 1.533805173577039e-07, - "loss": 0.4029, - "num_tokens": 1348235790.0, - "reward": 1.9208984375, - "reward_std": 0.46979883313179016, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.90234375, - "rewards/format_reward/std": 0.29713961482048035, - "rewards/tag_count_reward/mean": 0.9482421875, - "rewards/tag_count_reward/std": 0.170634925365448, + "grad_norm": 5.666050910949707, + "kl": 3.08203125, + "learning_rate": 1.5342020501995375e-07, + "loss": 0.168, + "num_tokens": 1435347037.0, + "reward": 1.08447265625, + "reward_std": 0.38367778062820435, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.083984375, + "rewards/format_reward/std": 0.2776356339454651, + "rewards/tag_count_reward/mean": 0.90478515625, + "rewards/tag_count_reward/std": 0.21408694982528687, "step": 2517 }, { @@ -73008,27 +73008,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 846.21875, - "completions/mean_terminated_length": 804.9454956054688, - "completions/min_length": 71.0, - "completions/min_terminated_length": 71.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1129.193359375, + "completions/mean_terminated_length": 1065.8935546875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.8596057011180336, - "grad_norm": 2.724597215652466, - "kl": 6.79296875, - "learning_rate": 1.531275338325429e-07, - "loss": 0.3978, - "num_tokens": 1348749358.0, - "reward": 1.90380859375, - "reward_std": 0.5706411600112915, - "rewards/accuracy_reward/mean": 0.12890625, - "rewards/accuracy_reward/std": 0.33542385697364807, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.19207598268985748, + "grad_norm": 7.628841400146484, + "kl": 3.21484375, + "learning_rate": 1.5316703737405416e-07, + "loss": 0.1428, + "num_tokens": 1436005488.0, + "reward": 1.1533203125, + "reward_std": 0.3962182402610779, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38430243730545044, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.9072265625, + "rewards/tag_count_reward/std": 0.21027082204818726, "step": 2518 }, { @@ -73037,27 +73037,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1873.0, - "completions/mean_length": 785.5625, - "completions/mean_terminated_length": 755.2640380859375, - "completions/min_length": 68.0, - "completions/min_terminated_length": 68.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1042.45703125, + "completions/mean_terminated_length": 984.2850952148438, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, "epoch": 0.8599470854314244, - "grad_norm": 2.1348705291748047, - "kl": 5.8515625, - "learning_rate": 1.528751135952585e-07, - "loss": 0.3575, - "num_tokens": 1349235422.0, - "reward": 1.923828125, - "reward_std": 0.4424586892127991, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.17754259705543518, + "grad_norm": 4.38917875289917, + "kl": 3.01953125, + "learning_rate": 1.5291443338738242e-07, + "loss": 0.1765, + "num_tokens": 1436623082.0, + "reward": 1.07470703125, + "reward_std": 0.35303807258605957, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.90673828125, + "rewards/tag_count_reward/std": 0.20564086735248566, "step": 2519 }, { @@ -73066,27 +73066,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1990.0, - "completions/mean_length": 731.671875, - "completions/mean_terminated_length": 710.77783203125, - "completions/min_length": 184.0, - "completions/min_terminated_length": 184.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1063.9921875, + "completions/mean_terminated_length": 989.5714721679688, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, "epoch": 0.8602884697448152, - "grad_norm": 1.0865064859390259, - "kl": 3.98046875, - "learning_rate": 1.5262325700411534e-07, - "loss": 0.2623, - "num_tokens": 1349688662.0, - "reward": 1.99267578125, - "reward_std": 0.38150593638420105, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.923828125, - "rewards/format_reward/std": 0.26553234457969666, - "rewards/tag_count_reward/mean": 0.96337890625, - "rewards/tag_count_reward/std": 0.13454900681972504, + "grad_norm": 7.5608229637146, + "kl": 3.39453125, + "learning_rate": 1.526623934187359e-07, + "loss": 0.1604, + "num_tokens": 1437246470.0, + "reward": 1.07275390625, + "reward_std": 0.3858228325843811, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.89892578125, + "rewards/tag_count_reward/std": 0.2259223759174347, "step": 2520 }, { @@ -73095,27 +73095,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1920.0, - "completions/mean_length": 770.08203125, - "completions/mean_terminated_length": 731.5130615234375, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1040.384765625, + "completions/mean_terminated_length": 982.0929565429688, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, "epoch": 0.860629854058206, - "grad_norm": 1.1588002443313599, - "kl": 6.4140625, - "learning_rate": 1.5237196441657767e-07, - "loss": 0.433, - "num_tokens": 1350163408.0, - "reward": 1.9150390625, - "reward_std": 0.5046082735061646, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.1709035038948059, + "grad_norm": 7.116761684417725, + "kl": 2.525390625, + "learning_rate": 1.524109178261106e-07, + "loss": 0.157, + "num_tokens": 1437859611.0, + "reward": 1.08837890625, + "reward_std": 0.3956470489501953, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.076171875, + "rewards/format_reward/std": 0.26553234457969666, + "rewards/tag_count_reward/mean": 0.90673828125, + "rewards/tag_count_reward/std": 0.20504523813724518, "step": 2521 }, { @@ -73124,27 +73124,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 832.359375, - "completions/mean_terminated_length": 777.779541015625, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1164.919921875, + "completions/mean_terminated_length": 1096.132568359375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, "epoch": 0.8609712383715968, - "grad_norm": 2.694650650024414, - "kl": 6.46875, - "learning_rate": 1.5212123618930924e-07, - "loss": 0.3736, - "num_tokens": 1350670904.0, - "reward": 1.84033203125, - "reward_std": 0.522647500038147, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.19244377315044403, + "grad_norm": 5.293074607849121, + "kl": 2.326171875, + "learning_rate": 1.521600069667012e-07, + "loss": 0.154, + "num_tokens": 1438537378.0, + "reward": 1.0576171875, + "reward_std": 0.3799906373023987, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.9033203125, + "rewards/tag_count_reward/std": 0.21314142644405365, "step": 2522 }, { @@ -73153,27 +73153,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 792.345703125, - "completions/mean_terminated_length": 757.046142578125, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1036.826171875, + "completions/mean_terminated_length": 969.4146118164062, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, "epoch": 0.8613126226849876, - "grad_norm": 1.2514175176620483, - "kl": 6.6015625, - "learning_rate": 1.518710726781731e-07, - "loss": 0.4233, - "num_tokens": 1351153129.0, - "reward": 1.90869140625, - "reward_std": 0.5649210810661316, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.1834864467382431, + "grad_norm": 4.500375747680664, + "kl": 2.8046875, + "learning_rate": 1.5190966119689977e-07, + "loss": 0.1897, + "num_tokens": 1439144777.0, + "reward": 1.16357421875, + "reward_std": 0.42020463943481445, + "rewards/accuracy_reward/mean": 0.212890625, + "rewards/accuracy_reward/std": 0.409751296043396, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.90966796875, + "rewards/tag_count_reward/std": 0.20753724873065948, "step": 2523 }, { @@ -73182,27 +73182,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1947.0, - "completions/mean_length": 790.642578125, - "completions/mean_terminated_length": 750.0826416015625, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1055.787109375, + "completions/mean_terminated_length": 994.0311889648438, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.8616540069983785, - "grad_norm": 1.3608920574188232, - "kl": 6.1328125, - "learning_rate": 1.5162147423823043e-07, - "loss": 0.3918, - "num_tokens": 1351638034.0, - "reward": 1.91455078125, - "reward_std": 0.577094316482544, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.19674043357372284, + "grad_norm": 4.57724666595459, + "kl": 1.884765625, + "learning_rate": 1.516598808722962e-07, + "loss": 0.1277, + "num_tokens": 1439765436.0, + "reward": 1.2265625, + "reward_std": 0.4196757674217224, + "rewards/accuracy_reward/mean": 0.244140625, + "rewards/accuracy_reward/std": 0.42999663949012756, + "rewards/format_reward/mean": 0.060546875, + "rewards/format_reward/std": 0.2387305200099945, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.19471992552280426, "step": 2524 }, { @@ -73211,27 +73211,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 821.892578125, - "completions/mean_terminated_length": 779.7838745117188, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1090.970703125, + "completions/mean_terminated_length": 1029.291015625, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, "epoch": 0.8619953913117693, - "grad_norm": 1.9353266954421997, - "kl": 5.9375, - "learning_rate": 1.5137244122374076e-07, - "loss": 0.362, - "num_tokens": 1352131995.0, - "reward": 1.853515625, - "reward_std": 0.5325428247451782, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.17965060472488403, + "grad_norm": 2.4420371055603027, + "kl": 2.30078125, + "learning_rate": 1.514106663476768e-07, + "loss": 0.1403, + "num_tokens": 1440397165.0, + "reward": 1.12158203125, + "reward_std": 0.4238749146461487, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.2020832896232605, "step": 2525 }, { @@ -73240,27 +73240,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.10546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1986.0, - "completions/mean_length": 809.013671875, - "completions/mean_terminated_length": 766.462646484375, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1143.25390625, + "completions/mean_terminated_length": 1036.580810546875, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, "epoch": 0.86233677562516, - "grad_norm": 0.9738795161247253, - "kl": 6.203125, - "learning_rate": 1.5112397398816076e-07, - "loss": 0.407, - "num_tokens": 1352626498.0, - "reward": 1.83642578125, - "reward_std": 0.559461236000061, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.2022770643234253, + "grad_norm": 3.158919095993042, + "kl": 2.669921875, + "learning_rate": 1.5116201797702455e-07, + "loss": 0.1799, + "num_tokens": 1441062799.0, + "reward": 1.0625, + "reward_std": 0.4097965359687805, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.890625, + "rewards/tag_count_reward/std": 0.22718821465969086, "step": 2526 }, { @@ -73269,27 +73269,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1923.0, - "completions/mean_length": 767.6484375, - "completions/mean_terminated_length": 734.2926025390625, - "completions/min_length": 76.0, - "completions/min_terminated_length": 76.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1082.708984375, + "completions/mean_terminated_length": 1009.703857421875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.8626781599385508, - "grad_norm": 3.458853244781494, - "kl": 5.58984375, - "learning_rate": 1.508760728841442e-07, - "loss": 0.4151, - "num_tokens": 1353100606.0, - "reward": 1.82470703125, - "reward_std": 0.4834443926811218, - "rewards/accuracy_reward/mean": 0.026209676638245583, - "rewards/accuracy_reward/std": 0.1599196344614029, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.1906779557466507, + "grad_norm": 3.6557445526123047, + "kl": 2.439453125, + "learning_rate": 1.5091393611351817e-07, + "loss": 0.1372, + "num_tokens": 1441698218.0, + "reward": 1.0244140625, + "reward_std": 0.3351839780807495, + "rewards/accuracy_reward/mean": 0.05040322616696358, + "rewards/accuracy_reward/std": 0.21899642050266266, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.9111328125, + "rewards/tag_count_reward/std": 0.20610326528549194, "step": 2527 }, { @@ -73298,27 +73298,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1841.0, - "completions/mean_length": 797.880859375, - "completions/mean_terminated_length": 752.3299560546875, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1122.416015625, + "completions/mean_terminated_length": 1008.747802734375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.8630195442519416, - "grad_norm": 2.578061103820801, - "kl": 5.765625, - "learning_rate": 1.506287382635415e-07, - "loss": 0.3849, - "num_tokens": 1353604833.0, - "reward": 1.8681640625, - "reward_std": 0.5036935210227966, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.17590700089931488, + "grad_norm": 3.7935750484466553, + "kl": 3.46875, + "learning_rate": 1.5066642110953168e-07, + "loss": 0.243, + "num_tokens": 1442368607.0, + "reward": 1.01904296875, + "reward_std": 0.3598157465457916, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.88427734375, + "rewards/tag_count_reward/std": 0.23232397437095642, "step": 2528 }, { @@ -73327,27 +73327,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1767.0, - "completions/mean_length": 798.412109375, - "completions/mean_terminated_length": 747.6158447265625, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1102.177734375, + "completions/mean_terminated_length": 1013.2543334960938, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, "epoch": 0.8633609285653324, - "grad_norm": 3.326608419418335, - "kl": 4.640625, - "learning_rate": 1.503819704773987e-07, - "loss": 0.3471, - "num_tokens": 1354089796.0, - "reward": 1.90234375, - "reward_std": 0.4647481143474579, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.892578125, - "rewards/format_reward/std": 0.30995169281959534, - "rewards/tag_count_reward/mean": 0.94921875, - "rewards/tag_count_reward/std": 0.16658000648021698, + "grad_norm": 2.2721869945526123, + "kl": 2.365234375, + "learning_rate": 1.5041947331663385e-07, + "loss": 0.1527, + "num_tokens": 1443009098.0, + "reward": 1.0830078125, + "reward_std": 0.37287774682044983, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.9091796875, + "rewards/tag_count_reward/std": 0.20702843368053436, "step": 2529 }, { @@ -73356,27 +73356,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1917.0, - "completions/mean_length": 775.318359375, - "completions/mean_terminated_length": 715.4580688476562, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1048.83984375, + "completions/mean_terminated_length": 991.0371704101562, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, "epoch": 0.8637023128787232, - "grad_norm": 1.401532769203186, - "kl": 7.0234375, - "learning_rate": 1.501357698759578e-07, - "loss": 0.4558, - "num_tokens": 1354558583.0, - "reward": 1.86279296875, - "reward_std": 0.5681871771812439, - "rewards/accuracy_reward/mean": 0.08467742055654526, - "rewards/accuracy_reward/std": 0.278682142496109, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.19523389637470245, + "grad_norm": 10.322519302368164, + "kl": 2.703125, + "learning_rate": 1.5017309308558804e-07, + "loss": 0.2011, + "num_tokens": 1443617928.0, + "reward": 1.08935546875, + "reward_std": 0.3528033494949341, + "rewards/accuracy_reward/mean": 0.12298387289047241, + "rewards/accuracy_reward/std": 0.32875028252601624, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21157780289649963, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.19436074793338776, "step": 2530 }, { @@ -73385,27 +73385,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1914.0, - "completions/mean_length": 782.685546875, - "completions/mean_terminated_length": 741.8689575195312, - "completions/min_length": 94.0, - "completions/min_terminated_length": 94.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1045.048828125, + "completions/mean_terminated_length": 978.1854858398438, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, "epoch": 0.864043697192114, - "grad_norm": 2.5293610095977783, - "kl": 5.68359375, - "learning_rate": 1.498901368086553e-07, - "loss": 0.372, - "num_tokens": 1355034646.0, - "reward": 1.8740234375, - "reward_std": 0.5300194621086121, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.9345703125, - "rewards/tag_count_reward/std": 0.17864517867565155, + "grad_norm": 4.100895881652832, + "kl": 3.2421875, + "learning_rate": 1.499272807663511e-07, + "loss": 0.2239, + "num_tokens": 1444228321.0, + "reward": 1.07861328125, + "reward_std": 0.3856808543205261, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.072265625, + "rewards/format_reward/std": 0.2591804563999176, + "rewards/tag_count_reward/mean": 0.90478515625, + "rewards/tag_count_reward/std": 0.21408694982528687, "step": 2531 }, { @@ -73414,27 +73414,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1889.0, - "completions/mean_length": 847.482421875, - "completions/mean_terminated_length": 791.016357421875, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1153.837890625, + "completions/mean_terminated_length": 1094.2271728515625, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, "epoch": 0.8643850815055049, - "grad_norm": 1.91965651512146, - "kl": 6.7578125, - "learning_rate": 1.4964507162412268e-07, - "loss": 0.4391, - "num_tokens": 1355549549.0, - "reward": 1.84375, - "reward_std": 0.5757134556770325, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.20512036979198456, + "grad_norm": 2.2413811683654785, + "kl": 2.34765625, + "learning_rate": 1.4968203670807367e-07, + "loss": 0.1116, + "num_tokens": 1444900078.0, + "reward": 1.11181640625, + "reward_std": 0.4081215262413025, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.07421875, + "rewards/format_reward/std": 0.2623828947544098, + "rewards/tag_count_reward/mean": 0.91455078125, + "rewards/tag_count_reward/std": 0.20004449784755707, "step": 2532 }, { @@ -73443,27 +73443,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1939.0, - "completions/mean_length": 799.568359375, - "completions/mean_terminated_length": 746.1731567382812, - "completions/min_length": 100.0, - "completions/min_terminated_length": 100.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1102.888671875, + "completions/mean_terminated_length": 1066.4644775390625, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, "epoch": 0.8647264658188957, - "grad_norm": 1.4131958484649658, - "kl": 7.1328125, - "learning_rate": 1.4940057467018482e-07, - "loss": 0.4115, - "num_tokens": 1356031168.0, - "reward": 1.81689453125, - "reward_std": 0.5381094217300415, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.20823590457439423, + "grad_norm": 6.804876327514648, + "kl": 2.703125, + "learning_rate": 1.4943736125909862e-07, + "loss": 0.1238, + "num_tokens": 1445536997.0, + "reward": 1.13232421875, + "reward_std": 0.4266893267631531, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.087890625, + "rewards/format_reward/std": 0.2834126651287079, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.17700830101966858, "step": 2533 }, { @@ -73472,27 +73472,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1823.0, - "completions/mean_length": 772.65234375, - "completions/mean_terminated_length": 747.2470092773438, - "completions/min_length": 82.0, - "completions/min_terminated_length": 82.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1066.009765625, + "completions/mean_terminated_length": 1032.284912109375, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, "epoch": 0.8650678501322864, - "grad_norm": 1.1551711559295654, - "kl": 4.83203125, - "learning_rate": 1.491566462938605e-07, - "loss": 0.2803, - "num_tokens": 1356505134.0, - "reward": 1.9287109375, - "reward_std": 0.4677783250808716, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.9482421875, - "rewards/tag_count_reward/std": 0.170634925365448, + "grad_norm": 1.6600645780563354, + "kl": 1.92578125, + "learning_rate": 1.4919325476696197e-07, + "loss": 0.0957, + "num_tokens": 1446161162.0, + "reward": 1.13134765625, + "reward_std": 0.3937014639377594, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.076171875, + "rewards/format_reward/std": 0.26553234457969666, + "rewards/tag_count_reward/mean": 0.93798828125, + "rewards/tag_count_reward/std": 0.17677602171897888, "step": 2534 }, { @@ -73501,27 +73501,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.115234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 793.908203125, - "completions/mean_terminated_length": 763.81005859375, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1164.103515625, + "completions/mean_terminated_length": 1048.9822998046875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, "epoch": 0.8654092344456772, - "grad_norm": 1.8901922702789307, - "kl": 6.2421875, - "learning_rate": 1.489132868413617e-07, - "loss": 0.4336, - "num_tokens": 1356998495.0, - "reward": 1.86669921875, - "reward_std": 0.48085707426071167, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.94287109375, - "rewards/tag_count_reward/std": 0.17213678359985352, + "grad_norm": 2.229029893875122, + "kl": 4.27734375, + "learning_rate": 1.4894971757839084e-07, + "loss": 0.2655, + "num_tokens": 1446844063.0, + "reward": 1.03173828125, + "reward_std": 0.3998609781265259, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.87744140625, + "rewards/tag_count_reward/std": 0.24379560351371765, "step": 2535 }, { @@ -73530,27 +73530,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1860.0, - "completions/mean_length": 794.125, - "completions/mean_terminated_length": 745.8012084960938, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1134.6640625, + "completions/mean_terminated_length": 1071.7412109375, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, "epoch": 0.865750618759068, - "grad_norm": 0.9839769601821899, - "kl": 7.3359375, - "learning_rate": 1.4867049665809232e-07, - "loss": 0.4569, - "num_tokens": 1357482591.0, - "reward": 1.86962890625, - "reward_std": 0.4972394108772278, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.19630283117294312, + "grad_norm": 9.9767484664917, + "kl": 3.36328125, + "learning_rate": 1.487067500393041e-07, + "loss": 0.161, + "num_tokens": 1447502515.0, + "reward": 1.10888671875, + "reward_std": 0.4104750156402588, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29176566004753113, + "rewards/tag_count_reward/mean": 0.91748046875, + "rewards/tag_count_reward/std": 0.19384385645389557, "step": 2536 }, { @@ -73559,27 +73559,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1973.0, - "completions/mean_length": 773.451171875, - "completions/mean_terminated_length": 737.6204833984375, - "completions/min_length": 114.0, - "completions/min_terminated_length": 114.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1109.14453125, + "completions/mean_terminated_length": 1060.94873046875, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, "epoch": 0.8660920030724588, - "grad_norm": 0.9966095685958862, - "kl": 6.2734375, - "learning_rate": 1.4842827608864886e-07, - "loss": 0.3642, - "num_tokens": 1357958662.0, - "reward": 1.9443359375, - "reward_std": 0.5586145520210266, - "rewards/accuracy_reward/mean": 0.1391129046678543, - "rewards/accuracy_reward/std": 0.3464137017726898, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.1820572316646576, + "grad_norm": 7.036647796630859, + "kl": 3.59765625, + "learning_rate": 1.4846435249481159e-07, + "loss": 0.1856, + "num_tokens": 1448150461.0, + "reward": 1.1728515625, + "reward_std": 0.4495765268802643, + "rewards/accuracy_reward/mean": 0.18145161867141724, + "rewards/accuracy_reward/std": 0.38578101992607117, + "rewards/format_reward/mean": 0.095703125, + "rewards/format_reward/std": 0.2944713830947876, + "rewards/tag_count_reward/mean": 0.9013671875, + "rewards/tag_count_reward/std": 0.20816993713378906, "step": 2537 }, { @@ -73588,27 +73588,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 715.365234375, - "completions/mean_terminated_length": 686.1057739257812, - "completions/min_length": 88.0, - "completions/min_terminated_length": 88.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 994.533203125, + "completions/mean_terminated_length": 935.8866577148438, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.8664333873858496, - "grad_norm": 1.615759015083313, - "kl": 7.0390625, - "learning_rate": 1.48186625476819e-07, - "loss": 0.4503, - "num_tokens": 1358404289.0, - "reward": 1.94287109375, - "reward_std": 0.5075457096099854, - "rewards/accuracy_reward/mean": 0.11491935700178146, - "rewards/accuracy_reward/std": 0.3192465901374817, - "rewards/format_reward/mean": 0.892578125, - "rewards/format_reward/std": 0.30995169281959534, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.18322588503360748, + "grad_norm": 7.680780410766602, + "kl": 4.0859375, + "learning_rate": 1.4822252528921314e-07, + "loss": 0.1971, + "num_tokens": 1448739022.0, + "reward": 1.1787109375, + "reward_std": 0.4314824342727661, + "rewards/accuracy_reward/mean": 0.19153225421905518, + "rewards/accuracy_reward/std": 0.3939041793346405, + "rewards/format_reward/mean": 0.080078125, + "rewards/format_reward/std": 0.271679550409317, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.19971762597560883, "step": 2538 }, { @@ -73617,27 +73617,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 803.2890625, - "completions/mean_terminated_length": 781.0178833007812, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1128.978515625, + "completions/mean_terminated_length": 1073.7991943359375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, "epoch": 0.8667747716992404, - "grad_norm": 2.0541937351226807, - "kl": 4.6796875, - "learning_rate": 1.4794554516558166e-07, - "loss": 0.3296, - "num_tokens": 1358893781.0, - "reward": 1.9755859375, - "reward_std": 0.4012356698513031, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.923828125, - "rewards/format_reward/std": 0.26553234457969666, - "rewards/tag_count_reward/mean": 0.9560546875, - "rewards/tag_count_reward/std": 0.16185224056243896, + "grad_norm": 9.54370403289795, + "kl": 3.220703125, + "learning_rate": 1.479812687659988e-07, + "loss": 0.1228, + "num_tokens": 1449395267.0, + "reward": 1.12109375, + "reward_std": 0.3701077103614807, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.0859375, + "rewards/format_reward/std": 0.28054583072662354, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.19234009087085724, "step": 2539 }, { @@ -73646,27 +73646,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 738.998046875, - "completions/mean_terminated_length": 704.8958129882812, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1034.73828125, + "completions/mean_terminated_length": 991.4012451171875, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, "epoch": 0.8671161560126313, - "grad_norm": 1.09844970703125, - "kl": 4.75, - "learning_rate": 1.477050354971061e-07, - "loss": 0.2915, - "num_tokens": 1359347220.0, - "reward": 1.9404296875, - "reward_std": 0.4506056308746338, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.9560546875, - "rewards/tag_count_reward/std": 0.14845077693462372, + "grad_norm": 11.23932933807373, + "kl": 2.908203125, + "learning_rate": 1.4774058326784793e-07, + "loss": 0.1192, + "num_tokens": 1450000125.0, + "reward": 1.134765625, + "reward_std": 0.4307591915130615, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.08984375, + "rewards/format_reward/std": 0.2862374484539032, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.19686728715896606, "step": 2540 }, { @@ -73675,27 +73675,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1884.0, - "completions/mean_length": 747.431640625, - "completions/mean_terminated_length": 726.7877197265625, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1073.53515625, + "completions/mean_terminated_length": 1033.9227294921875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.8674575403260221, - "grad_norm": 1.7966375350952148, - "kl": 4.92578125, - "learning_rate": 1.474650968127518e-07, - "loss": 0.3323, - "num_tokens": 1359811713.0, - "reward": 1.9609375, - "reward_std": 0.3912133574485779, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.923828125, - "rewards/format_reward/std": 0.26553234457969666, - "rewards/tag_count_reward/mean": 0.9609375, - "rewards/tag_count_reward/std": 0.14899368584156036, + "grad_norm": 1.7925382852554321, + "kl": 2.2734375, + "learning_rate": 1.4750046913662893e-07, + "loss": 0.1239, + "num_tokens": 1450631583.0, + "reward": 1.12158203125, + "reward_std": 0.35589835047721863, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.93408203125, + "rewards/tag_count_reward/std": 0.17535409331321716, "step": 2541 }, { @@ -73704,27 +73704,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1926.0, - "completions/mean_length": 819.41015625, - "completions/mean_terminated_length": 787.40283203125, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1096.3671875, + "completions/mean_terminated_length": 1043.3897705078125, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, "epoch": 0.8677989246394128, - "grad_norm": 1.3843644857406616, - "kl": 4.751953125, - "learning_rate": 1.4722572945306812e-07, - "loss": 0.3132, - "num_tokens": 1360319683.0, - "reward": 1.91748046875, - "reward_std": 0.41918283700942993, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.95458984375, - "rewards/tag_count_reward/std": 0.1548733115196228, + "grad_norm": 4.390264511108398, + "kl": 2.271484375, + "learning_rate": 1.472609267133983e-07, + "loss": 0.1189, + "num_tokens": 1451281355.0, + "reward": 1.0712890625, + "reward_std": 0.3726848363876343, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.19748516380786896, "step": 2542 }, { @@ -73733,27 +73733,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.091796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 726.431640625, - "completions/mean_terminated_length": 697.4151611328125, - "completions/min_length": 38.0, - "completions/min_terminated_length": 38.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1116.59765625, + "completions/mean_terminated_length": 1022.4559326171875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.8681403089528036, - "grad_norm": 1.3754132986068726, - "kl": 5.34765625, - "learning_rate": 1.4698693375779296e-07, - "loss": 0.3411, - "num_tokens": 1360779072.0, - "reward": 1.96826171875, - "reward_std": 0.43035122752189636, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.912109375, - "rewards/format_reward/std": 0.2834126651287079, - "rewards/tag_count_reward/mean": 0.95263671875, - "rewards/tag_count_reward/std": 0.16352249681949615, + "grad_norm": 2.5176916122436523, + "kl": 3.55859375, + "learning_rate": 1.4702195633840086e-07, + "loss": 0.2086, + "num_tokens": 1451940509.0, + "reward": 1.10107421875, + "reward_std": 0.40442851185798645, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.89404296875, + "rewards/tag_count_reward/std": 0.2209184318780899, "step": 2543 }, { @@ -73762,27 +73762,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1945.0, - "completions/mean_length": 782.02734375, - "completions/mean_terminated_length": 751.64404296875, - "completions/min_length": 49.0, - "completions/min_terminated_length": 49.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 1114.931640625, + "completions/mean_terminated_length": 1058.908935546875, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, "epoch": 0.8684816932661944, - "grad_norm": 0.908316969871521, - "kl": 4.83984375, - "learning_rate": 1.4674871006585326e-07, - "loss": 0.318, - "num_tokens": 1361261438.0, - "reward": 1.91162109375, - "reward_std": 0.4011594355106354, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.92578125, - "rewards/format_reward/std": 0.2623828947544098, - "rewards/tag_count_reward/mean": 0.95458984375, - "rewards/tag_count_reward/std": 0.16407670080661774, + "grad_norm": 4.884047508239746, + "kl": 3.15625, + "learning_rate": 1.4678355835106862e-07, + "loss": 0.1385, + "num_tokens": 1452593322.0, + "reward": 1.02197265625, + "reward_std": 0.38512545824050903, + "rewards/accuracy_reward/mean": 0.03515625, + "rewards/accuracy_reward/std": 0.1843547374010086, + "rewards/format_reward/mean": 0.08203125, + "rewards/format_reward/std": 0.2746807038784027, + "rewards/tag_count_reward/mean": 0.90478515625, + "rewards/tag_count_reward/std": 0.2129412740468979, "step": 2544 }, { @@ -73791,27 +73791,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1882.0, - "completions/mean_length": 814.658203125, - "completions/mean_terminated_length": 774.8729858398438, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1166.640625, + "completions/mean_terminated_length": 1127.0693359375, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, "epoch": 0.8688230775795852, - "grad_norm": 0.8029277324676514, - "kl": 6.55078125, - "learning_rate": 1.46511058715364e-07, - "loss": 0.4211, - "num_tokens": 1361749663.0, - "reward": 1.8876953125, - "reward_std": 0.39295393228530884, - "rewards/accuracy_reward/mean": 0.021484375, - "rewards/accuracy_reward/std": 0.14513419568538666, - "rewards/format_reward/mean": 0.9140625, - "rewards/format_reward/std": 0.28054583072662354, - "rewards/tag_count_reward/mean": 0.9521484375, - "rewards/tag_count_reward/std": 0.16524095833301544, + "grad_norm": 3.527371644973755, + "kl": 2.2578125, + "learning_rate": 1.4654573309002081e-07, + "loss": 0.0698, + "num_tokens": 1453261762.0, + "reward": 1.06982421875, + "reward_std": 0.3864043354988098, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29176566004753113, + "rewards/tag_count_reward/mean": 0.91552734375, + "rewards/tag_count_reward/std": 0.19363176822662354, "step": 2545 }, { @@ -73820,27 +73820,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1915.0, - "completions/mean_length": 778.298828125, - "completions/mean_terminated_length": 750.421142578125, - "completions/min_length": 179.0, - "completions/min_terminated_length": 179.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1136.7421875, + "completions/mean_terminated_length": 1057.4183349609375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, "epoch": 0.869164461892976, - "grad_norm": 1.4204837083816528, - "kl": 5.6171875, - "learning_rate": 1.4627398004362774e-07, - "loss": 0.3371, - "num_tokens": 1362234728.0, - "reward": 1.90966796875, - "reward_std": 0.410476416349411, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.912109375, - "rewards/format_reward/std": 0.2834126651287079, - "rewards/tag_count_reward/mean": 0.95458984375, - "rewards/tag_count_reward/std": 0.15566104650497437, + "grad_norm": 2.3094141483306885, + "kl": 2.703125, + "learning_rate": 1.4630848089306282e-07, + "loss": 0.1297, + "num_tokens": 1453930350.0, + "reward": 1.044921875, + "reward_std": 0.38123565912246704, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.20979996025562286, "step": 2546 }, { @@ -73849,27 +73849,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1956.0, - "completions/mean_length": 802.7109375, - "completions/mean_terminated_length": 767.7027587890625, - "completions/min_length": 66.0, - "completions/min_terminated_length": 66.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1115.732421875, + "completions/mean_terminated_length": 1067.874755859375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, "epoch": 0.8695058462063668, - "grad_norm": 1.8084684610366821, - "kl": 6.5, - "learning_rate": 1.4603747438713426e-07, - "loss": 0.4016, - "num_tokens": 1362725940.0, - "reward": 1.8994140625, - "reward_std": 0.38757991790771484, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.9140625, - "rewards/format_reward/std": 0.28054583072662354, - "rewards/tag_count_reward/mean": 0.9521484375, - "rewards/tag_count_reward/std": 0.16375388205051422, + "grad_norm": 2.7179768085479736, + "kl": 2.16015625, + "learning_rate": 1.4607180209718666e-07, + "loss": 0.1209, + "num_tokens": 1454581829.0, + "reward": 1.03466796875, + "reward_std": 0.3109338581562042, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.17769792675971985, "step": 2547 }, { @@ -73878,27 +73878,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 822.4453125, - "completions/mean_terminated_length": 785.4567260742188, - "completions/min_length": 120.0, - "completions/min_terminated_length": 120.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1133.720703125, + "completions/mean_terminated_length": 1032.5748291015625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, "epoch": 0.8698472305197577, - "grad_norm": 3.6708779335021973, - "kl": 7.1484375, - "learning_rate": 1.458015420815601e-07, - "loss": 0.4196, - "num_tokens": 1363225736.0, - "reward": 1.9091796875, - "reward_std": 0.446837842464447, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.1813209503889084, + "grad_norm": 5.016415119171143, + "kl": 3.55859375, + "learning_rate": 1.458356970385692e-07, + "loss": 0.2238, + "num_tokens": 1455240998.0, + "reward": 1.10107421875, + "reward_std": 0.420688658952713, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.89013671875, + "rewards/tag_count_reward/std": 0.22011244297027588, "step": 2548 }, { @@ -73907,27 +73907,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1769.0, - "completions/mean_length": 751.98046875, - "completions/mean_terminated_length": 723.5249633789062, - "completions/min_length": 79.0, - "completions/min_terminated_length": 79.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1105.921875, + "completions/mean_terminated_length": 1032.5389404296875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, "epoch": 0.8701886148331485, - "grad_norm": 1.6291117668151855, - "kl": 4.8203125, - "learning_rate": 1.4556618346176813e-07, - "loss": 0.2759, - "num_tokens": 1363694462.0, - "reward": 1.93994140625, - "reward_std": 0.40668895840644836, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.92578125, - "rewards/format_reward/std": 0.2623828947544098, - "rewards/tag_count_reward/mean": 0.95556640625, - "rewards/tag_count_reward/std": 0.15515686571598053, + "grad_norm": 7.587802410125732, + "kl": 2.48828125, + "learning_rate": 1.4560016605257285e-07, + "loss": 0.1559, + "num_tokens": 1455890942.0, + "reward": 1.068359375, + "reward_std": 0.4012216329574585, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.21373461186885834, "step": 2549 }, { @@ -73936,27 +73936,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1962.0, - "completions/mean_length": 850.955078125, - "completions/mean_terminated_length": 802.294677734375, - "completions/min_length": 153.0, - "completions/min_terminated_length": 153.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1185.232421875, + "completions/mean_terminated_length": 1112.1165771484375, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, "epoch": 0.8705299991465392, - "grad_norm": 2.863211154937744, - "kl": 8.5390625, - "learning_rate": 1.453313988618067e-07, - "loss": 0.5055, - "num_tokens": 1364210135.0, - "reward": 1.8369140625, - "reward_std": 0.5257569551467896, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.20429649949073792, + "grad_norm": 1.9166008234024048, + "kl": 2.2734375, + "learning_rate": 1.4536520947374438e-07, + "loss": 0.1179, + "num_tokens": 1456577765.0, + "reward": 1.080078125, + "reward_std": 0.36289283633232117, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.908203125, + "rewards/tag_count_reward/std": 0.20895110070705414, "step": 2550 }, { @@ -73965,27 +73965,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 764.357421875, - "completions/mean_terminated_length": 722.9495849609375, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1023.712890625, + "completions/mean_terminated_length": 979.904296875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.87087138345993, - "grad_norm": 1.5717759132385254, - "kl": 5.95703125, - "learning_rate": 1.4509718861490983e-07, - "loss": 0.386, - "num_tokens": 1364665998.0, - "reward": 1.96923828125, - "reward_std": 0.4320371448993683, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.91015625, - "rewards/format_reward/std": 0.2862374484539032, - "rewards/tag_count_reward/mean": 0.94970703125, - "rewards/tag_count_reward/std": 0.1692764312028885, + "grad_norm": 3.534048318862915, + "kl": 2.134765625, + "learning_rate": 1.4513082763581479e-07, + "loss": 0.0891, + "num_tokens": 1457166418.0, + "reward": 1.17626953125, + "reward_std": 0.4116772413253784, + "rewards/accuracy_reward/mean": 0.17578125, + "rewards/accuracy_reward/std": 0.3810062110424042, + "rewards/format_reward/mean": 0.07421875, + "rewards/format_reward/std": 0.2623828947544098, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.18120694160461426, "step": 2551 }, { @@ -73994,27 +73994,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 744.84765625, - "completions/mean_terminated_length": 710.8978271484375, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1052.859375, + "completions/mean_terminated_length": 993.1097412109375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, "epoch": 0.8712127677733208, - "grad_norm": 2.072315216064453, - "kl": 5.8515625, - "learning_rate": 1.4486355305349583e-07, - "loss": 0.3489, - "num_tokens": 1365130576.0, - "reward": 1.9111328125, - "reward_std": 0.4487929940223694, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.1833334118127823, + "grad_norm": 10.404422760009766, + "kl": 2.15625, + "learning_rate": 1.4489702087169846e-07, + "loss": 0.1479, + "num_tokens": 1457788698.0, + "reward": 1.11865234375, + "reward_std": 0.384998619556427, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.91357421875, + "rewards/tag_count_reward/std": 0.2008453756570816, "step": 2552 }, { @@ -74023,27 +74023,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.083984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 826.181640625, - "completions/mean_terminated_length": 791.8333129882812, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1178.447265625, + "completions/mean_terminated_length": 1098.722900390625, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, "epoch": 0.8715541520867116, - "grad_norm": 1.1610126495361328, - "kl": 3.9609375, - "learning_rate": 1.4463049250916792e-07, - "loss": 0.2513, - "num_tokens": 1365631373.0, - "reward": 1.91015625, - "reward_std": 0.35444962978363037, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.916015625, - "rewards/format_reward/std": 0.2776356339454651, - "rewards/tag_count_reward/mean": 0.9609375, - "rewards/tag_count_reward/std": 0.1396721750497818, + "grad_norm": 7.825228214263916, + "kl": 3.3125, + "learning_rate": 1.4466378951349356e-07, + "loss": 0.2117, + "num_tokens": 1458469855.0, + "reward": 1.0068359375, + "reward_std": 0.3696444630622864, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.8857421875, + "rewards/tag_count_reward/std": 0.22747193276882172, "step": 2553 }, { @@ -74052,27 +74052,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1865.0, - "completions/mean_length": 764.099609375, - "completions/mean_terminated_length": 730.6513061523438, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1123.00390625, + "completions/mean_terminated_length": 1031.6953125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.8718955364001024, - "grad_norm": 1.0609973669052124, - "kl": 4.712890625, - "learning_rate": 1.4439800731271267e-07, - "loss": 0.3242, - "num_tokens": 1366106272.0, - "reward": 1.92333984375, - "reward_std": 0.3831869959831238, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.95654296875, - "rewards/tag_count_reward/std": 0.1490057110786438, + "grad_norm": 4.3146443367004395, + "kl": 2.83203125, + "learning_rate": 1.4443113389248027e-07, + "loss": 0.1894, + "num_tokens": 1459128513.0, + "reward": 1.0380859375, + "reward_std": 0.33470016717910767, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.9072265625, + "rewards/tag_count_reward/std": 0.20968833565711975, "step": 2554 }, { @@ -74081,27 +74081,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.09765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 916.3125, - "completions/mean_terminated_length": 886.8296508789062, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1260.181640625, + "completions/mean_terminated_length": 1174.919921875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, "epoch": 0.8722369207134932, - "grad_norm": 1.3187130689620972, - "kl": 4.90234375, - "learning_rate": 1.4416609779410049e-07, - "loss": 0.3169, - "num_tokens": 1366657712.0, - "reward": 1.84130859375, - "reward_std": 0.4678717851638794, - "rewards/accuracy_reward/mean": 0.025390625, - "rewards/accuracy_reward/std": 0.15746226906776428, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.19038206338882446, + "grad_norm": 4.6336541175842285, + "kl": 2.39453125, + "learning_rate": 1.4419905433912138e-07, + "loss": 0.1336, + "num_tokens": 1459856014.0, + "reward": 1.01708984375, + "reward_std": 0.3544754981994629, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.89599609375, + "rewards/tag_count_reward/std": 0.21626292169094086, "step": 2555 }, { @@ -74110,27 +74110,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 765.8828125, - "completions/mean_terminated_length": 745.5317993164062, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1000.650390625, + "completions/mean_terminated_length": 953.62646484375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.872578305026884, - "grad_norm": 1.5510096549987793, - "kl": 4.05859375, - "learning_rate": 1.4393476428248442e-07, - "loss": 0.2521, - "num_tokens": 1367136164.0, - "reward": 1.9287109375, - "reward_std": 0.4307078719139099, - "rewards/accuracy_reward/mean": 0.07661290466785431, - "rewards/accuracy_reward/std": 0.2662447690963745, - "rewards/format_reward/mean": 0.91015625, - "rewards/format_reward/std": 0.2862374484539032, - "rewards/tag_count_reward/mean": 0.9443359375, - "rewards/tag_count_reward/std": 0.17226234078407288, + "grad_norm": 3.524115562438965, + "kl": 1.939453125, + "learning_rate": 1.439675511830612e-07, + "loss": 0.0962, + "num_tokens": 1460454667.0, + "reward": 1.08740234375, + "reward_std": 0.38455015420913696, + "rewards/accuracy_reward/mean": 0.12096773833036423, + "rewards/accuracy_reward/std": 0.32641899585723877, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.91943359375, + "rewards/tag_count_reward/std": 0.189572274684906, "step": 2556 }, { @@ -74139,27 +74139,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1971.0, - "completions/mean_length": 786.2734375, - "completions/mean_terminated_length": 753.40283203125, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1041.2421875, + "completions/mean_terminated_length": 996.040771484375, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, "epoch": 0.8729196893402749, - "grad_norm": 2.504319906234741, - "kl": 4.68359375, - "learning_rate": 1.4370400710620017e-07, - "loss": 0.3464, - "num_tokens": 1367612320.0, - "reward": 1.9033203125, - "reward_std": 0.45469242334365845, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.900390625, - "rewards/format_reward/std": 0.29977133870124817, - "rewards/tag_count_reward/mean": 0.9482421875, - "rewards/tag_count_reward/std": 0.1625591367483139, + "grad_norm": 4.734194755554199, + "kl": 2.359375, + "learning_rate": 1.4373662475312574e-07, + "loss": 0.1604, + "num_tokens": 1461061367.0, + "reward": 1.0537109375, + "reward_std": 0.3407416343688965, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.18058165907859802, "step": 2557 }, { @@ -74168,27 +74168,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 785.9296875, - "completions/mean_terminated_length": 747.8389892578125, - "completions/min_length": 24.0, - "completions/min_terminated_length": 24.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1038.099609375, + "completions/mean_terminated_length": 1003.4161987304688, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, "epoch": 0.8732610736536656, - "grad_norm": 1.2066786289215088, - "kl": 5.4296875, - "learning_rate": 1.4347382659276529e-07, - "loss": 0.3174, - "num_tokens": 1368091916.0, - "reward": 1.86083984375, - "reward_std": 0.4825897514820099, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.18773873150348663, + "grad_norm": 2.0697131156921387, + "kl": 1.98046875, + "learning_rate": 1.4350627537732113e-07, + "loss": 0.1229, + "num_tokens": 1461670074.0, + "reward": 1.0712890625, + "reward_std": 0.33228811621665955, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.052734375, + "rewards/format_reward/std": 0.22372129559516907, + "rewards/tag_count_reward/mean": 0.9365234375, + "rewards/tag_count_reward/std": 0.1673554629087448, "step": 2558 }, { @@ -74197,27 +74197,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1996.0, - "completions/mean_length": 869.80078125, - "completions/mean_terminated_length": 836.6787109375, - "completions/min_length": 62.0, - "completions/min_terminated_length": 62.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1127.173828125, + "completions/mean_terminated_length": 1077.9114990234375, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, "epoch": 0.8736024579670564, - "grad_norm": 1.2075902223587036, - "kl": 5.69140625, - "learning_rate": 1.4324422306887873e-07, - "loss": 0.3744, - "num_tokens": 1368614374.0, - "reward": 1.87451171875, - "reward_std": 0.5310301780700684, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.19419844448566437, + "grad_norm": 2.5313568115234375, + "kl": 2.23046875, + "learning_rate": 1.432765033828347e-07, + "loss": 0.1116, + "num_tokens": 1462324307.0, + "reward": 1.0595703125, + "reward_std": 0.381880521774292, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.9072265625, + "rewards/tag_count_reward/std": 0.20135675370693207, "step": 2559 }, { @@ -74226,27 +74226,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 796.73046875, - "completions/mean_terminated_length": 758.9657592773438, - "completions/min_length": 86.0, - "completions/min_terminated_length": 86.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1056.529296875, + "completions/mean_terminated_length": 1009.8956909179688, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, "epoch": 0.8739438422804472, - "grad_norm": 0.9230890870094299, - "kl": 5.16796875, - "learning_rate": 1.430151968604211e-07, - "loss": 0.3276, - "num_tokens": 1369103500.0, - "reward": 1.9501953125, - "reward_std": 0.5184177160263062, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.17729215323925018, + "grad_norm": 1.468004822731018, + "kl": 2.63671875, + "learning_rate": 1.4304730909603305e-07, + "loss": 0.1446, + "num_tokens": 1462946450.0, + "reward": 1.162109375, + "reward_std": 0.41191843152046204, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.18666234612464905, "step": 2560 }, { @@ -74255,27 +74255,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1959.0, - "completions/mean_length": 785.23046875, - "completions/mean_terminated_length": 760.0757446289062, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1095.26171875, + "completions/mean_terminated_length": 1054.5133056640625, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, "epoch": 0.874285226593838, - "grad_norm": 1.4074889421463013, - "kl": 3.5390625, - "learning_rate": 1.4278674829245282e-07, - "loss": 0.2361, - "num_tokens": 1369577762.0, - "reward": 1.9853515625, - "reward_std": 0.3713374137878418, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.927734375, + "grad_norm": 4.126630783081055, + "kl": 2.494140625, + "learning_rate": 1.428186928424625e-07, + "loss": 0.1257, + "num_tokens": 1463579448.0, + "reward": 1.16650390625, + "reward_std": 0.3979930281639099, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, + "rewards/format_reward/mean": 0.072265625, "rewards/format_reward/std": 0.2591804563999176, - "rewards/tag_count_reward/mean": 0.9580078125, - "rewards/tag_count_reward/std": 0.1522638499736786, + "rewards/tag_count_reward/mean": 0.92626953125, + "rewards/tag_count_reward/std": 0.18913322687149048, "step": 2561 }, { @@ -74284,27 +74284,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.09765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1910.0, - "completions/mean_length": 839.52734375, - "completions/mean_terminated_length": 787.8411865234375, - "completions/min_length": 59.0, - "completions/min_terminated_length": 59.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1139.771484375, + "completions/mean_terminated_length": 1041.4783935546875, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, "epoch": 0.8746266109072288, - "grad_norm": 3.308898448944092, - "kl": 7.6796875, - "learning_rate": 1.425588776892151e-07, - "loss": 0.4635, - "num_tokens": 1370086176.0, - "reward": 1.900390625, - "reward_std": 0.5681383609771729, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.20466333627700806, + "grad_norm": 6.114365577697754, + "kl": 2.91796875, + "learning_rate": 1.425906549468481e-07, + "loss": 0.1625, + "num_tokens": 1464241587.0, + "reward": 1.158203125, + "reward_std": 0.46258124709129333, + "rewards/accuracy_reward/mean": 0.177734375, + "rewards/accuracy_reward/std": 0.3826628625392914, + "rewards/format_reward/mean": 0.08203125, + "rewards/format_reward/std": 0.2746807038784027, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.2182645946741104, "step": 2562 }, { @@ -74313,27 +74313,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1964.0, - "completions/mean_length": 724.361328125, - "completions/mean_terminated_length": 687.1505737304688, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1004.640625, + "completions/mean_terminated_length": 964.4299926757812, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, "epoch": 0.8749679952206196, - "grad_norm": 1.095280647277832, - "kl": 5.5, - "learning_rate": 1.423315853741285e-07, - "loss": 0.3733, - "num_tokens": 1370535753.0, - "reward": 1.94873046875, - "reward_std": 0.4907839596271515, - "rewards/accuracy_reward/mean": 0.10080645233392715, - "rewards/accuracy_reward/std": 0.30137622356414795, - "rewards/format_reward/mean": 0.900390625, - "rewards/format_reward/std": 0.29977133870124817, - "rewards/tag_count_reward/mean": 0.95068359375, - "rewards/tag_count_reward/std": 0.15525536239147186, + "grad_norm": 11.661547660827637, + "kl": 2.521484375, + "learning_rate": 1.4236319573309374e-07, + "loss": 0.0627, + "num_tokens": 1464834667.0, + "reward": 1.234375, + "reward_std": 0.3940182328224182, + "rewards/accuracy_reward/mean": 0.19959677755832672, + "rewards/accuracy_reward/std": 0.40010079741477966, + "rewards/format_reward/mean": 0.095703125, + "rewards/format_reward/std": 0.2944713830947876, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.16899466514587402, "step": 2563 }, { @@ -74342,27 +74342,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 799.357421875, - "completions/mean_terminated_length": 761.6719970703125, - "completions/min_length": 86.0, - "completions/min_terminated_length": 86.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1126.82421875, + "completions/mean_terminated_length": 1059.232666015625, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, "epoch": 0.8753093795340104, - "grad_norm": 3.3422834873199463, - "kl": 7.0546875, - "learning_rate": 1.4210487166979283e-07, - "loss": 0.4382, - "num_tokens": 1371035856.0, - "reward": 1.81787109375, - "reward_std": 0.534123957157135, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.19397689402103424, + "grad_norm": 2.236179828643799, + "kl": 3.09375, + "learning_rate": 1.421363155242809e-07, + "loss": 0.1616, + "num_tokens": 1465502433.0, + "reward": 1.03515625, + "reward_std": 0.4142110347747803, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.88671875, + "rewards/tag_count_reward/std": 0.22362731397151947, "step": 2564 }, { @@ -74371,27 +74371,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 823.25, - "completions/mean_terminated_length": 776.0486450195312, - "completions/min_length": 46.0, - "completions/min_terminated_length": 46.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1080.033203125, + "completions/mean_terminated_length": 1006.82568359375, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, "epoch": 0.8756507638474013, - "grad_norm": 3.232412338256836, - "kl": 6.9140625, - "learning_rate": 1.4187873689798684e-07, - "loss": 0.4274, - "num_tokens": 1371533728.0, - "reward": 1.83642578125, - "reward_std": 0.5565227270126343, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.1964779943227768, + "grad_norm": 4.514580249786377, + "kl": 2.25, + "learning_rate": 1.4191001464266915e-07, + "loss": 0.1318, + "num_tokens": 1466131778.0, + "reward": 1.13525390625, + "reward_std": 0.3867892622947693, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.18835864961147308, "step": 2565 }, { @@ -74400,27 +74400,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1976.0, - "completions/mean_length": 809.40625, - "completions/mean_terminated_length": 764.2753295898438, - "completions/min_length": 84.0, - "completions/min_terminated_length": 84.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1017.310546875, + "completions/mean_terminated_length": 990.4589233398438, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.875992148160792, - "grad_norm": 2.557537794113159, - "kl": 6.6015625, - "learning_rate": 1.416531813796674e-07, - "loss": 0.4312, - "num_tokens": 1372028720.0, - "reward": 1.80859375, - "reward_std": 0.5407989621162415, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.19705164432525635, + "grad_norm": 4.5749125480651855, + "kl": 2.333984375, + "learning_rate": 1.4168429340969485e-07, + "loss": 0.0852, + "num_tokens": 1466733217.0, + "reward": 1.06787109375, + "reward_std": 0.3650696277618408, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.92724609375, + "rewards/tag_count_reward/std": 0.18625685572624207, "step": 2566 }, { @@ -74429,27 +74429,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 825.794921875, - "completions/mean_terminated_length": 783.8202514648438, - "completions/min_length": 96.0, - "completions/min_terminated_length": 96.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1165.134765625, + "completions/mean_terminated_length": 1092.34033203125, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, "epoch": 0.8763335324741828, - "grad_norm": 1.5472102165222168, - "kl": 5.2734375, - "learning_rate": 1.4142820543496936e-07, - "loss": 0.322, - "num_tokens": 1372530407.0, - "reward": 1.9423828125, - "reward_std": 0.5201644897460938, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.9345703125, - "rewards/tag_count_reward/std": 0.1847042590379715, + "grad_norm": 1.8185319900512695, + "kl": 2.9453125, + "learning_rate": 1.4145915214597114e-07, + "loss": 0.1619, + "num_tokens": 1467408646.0, + "reward": 1.15478515625, + "reward_std": 0.4403513967990875, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38430243730545044, + "rewards/format_reward/mean": 0.072265625, + "rewards/format_reward/std": 0.2591804563999176, + "rewards/tag_count_reward/mean": 0.90283203125, + "rewards/tag_count_reward/std": 0.20501726865768433, "step": 2567 }, { @@ -74458,27 +74458,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1862.0, - "completions/mean_length": 820.751953125, - "completions/mean_terminated_length": 793.806396484375, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1096.00390625, + "completions/mean_terminated_length": 1051.2269287109375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, "epoch": 0.8766749167875736, - "grad_norm": 1.468971848487854, - "kl": 4.44921875, - "learning_rate": 1.4120380938320487e-07, - "loss": 0.3006, - "num_tokens": 1373023064.0, - "reward": 1.86376953125, - "reward_std": 0.4445911645889282, - "rewards/accuracy_reward/mean": 0.021484375, - "rewards/accuracy_reward/std": 0.14513419568538666, - "rewards/format_reward/mean": 0.896484375, - "rewards/format_reward/std": 0.30492907762527466, - "rewards/tag_count_reward/mean": 0.94580078125, - "rewards/tag_count_reward/std": 0.1687902808189392, + "grad_norm": 5.176822662353516, + "kl": 2.17578125, + "learning_rate": 1.4123459117128738e-07, + "loss": 0.084, + "num_tokens": 1468042232.0, + "reward": 1.0498046875, + "reward_std": 0.35536178946495056, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.19067105650901794, "step": 2568 }, { @@ -74487,27 +74487,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1925.0, - "completions/mean_length": 778.3359375, - "completions/mean_terminated_length": 747.864013671875, - "completions/min_length": 106.0, - "completions/min_terminated_length": 106.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1099.076171875, + "completions/mean_terminated_length": 1056.471435546875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, "epoch": 0.8770163011009644, - "grad_norm": 2.1014490127563477, - "kl": 4.40625, - "learning_rate": 1.4097999354286316e-07, - "loss": 0.2878, - "num_tokens": 1373492564.0, - "reward": 1.86767578125, - "reward_std": 0.45762932300567627, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.17180897295475006, + "grad_norm": 4.8754143714904785, + "kl": 2.43359375, + "learning_rate": 1.4101061080460862e-07, + "loss": 0.0986, + "num_tokens": 1468675951.0, + "reward": 1.1181640625, + "reward_std": 0.4465675354003906, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.1015625, + "rewards/format_reward/std": 0.30236753821372986, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.19663172960281372, "step": 2569 }, { @@ -74516,27 +74516,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1937.0, - "completions/mean_length": 850.48828125, - "completions/mean_terminated_length": 806.854248046875, - "completions/min_length": 71.0, - "completions/min_terminated_length": 71.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1116.935546875, + "completions/mean_terminated_length": 1067.12548828125, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, "epoch": 0.8773576854143552, - "grad_norm": 1.2898650169372559, - "kl": 5.234375, - "learning_rate": 1.4075675823160982e-07, - "loss": 0.3387, - "num_tokens": 1374011678.0, - "reward": 1.89208984375, - "reward_std": 0.43215885758399963, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.896484375, - "rewards/format_reward/std": 0.30492907762527466, - "rewards/tag_count_reward/mean": 0.94677734375, - "rewards/tag_count_reward/std": 0.16691738367080688, + "grad_norm": 8.132530212402344, + "kl": 2.298828125, + "learning_rate": 1.4078721136407525e-07, + "loss": 0.1701, + "num_tokens": 1469331486.0, + "reward": 1.076171875, + "reward_std": 0.3553844094276428, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.05859375, + "rewards/format_reward/std": 0.23509246110916138, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.1840227097272873, "step": 2570 }, { @@ -74545,27 +74545,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1970.0, - "completions/mean_length": 807.484375, - "completions/mean_terminated_length": 772.6104125976562, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1067.01953125, + "completions/mean_terminated_length": 1035.375, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, "epoch": 0.877699069727746, - "grad_norm": 2.6725640296936035, - "kl": 5.8046875, - "learning_rate": 1.4053410376628647e-07, - "loss": 0.4056, - "num_tokens": 1374494390.0, - "reward": 1.875, - "reward_std": 0.4745900630950928, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.94140625, - "rewards/tag_count_reward/std": 0.17411890625953674, + "grad_norm": 7.264908313751221, + "kl": 2.404296875, + "learning_rate": 1.4056439316700256e-07, + "loss": 0.0678, + "num_tokens": 1469947080.0, + "reward": 1.0791015625, + "reward_std": 0.3467535972595215, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.9326171875, + "rewards/tag_count_reward/std": 0.17444512248039246, "step": 2571 }, { @@ -74574,27 +74574,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1892.0, - "completions/mean_length": 822.611328125, - "completions/mean_terminated_length": 788.16259765625, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1142.857421875, + "completions/mean_terminated_length": 1064.06591796875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.8780404540411368, - "grad_norm": 3.3297979831695557, - "kl": 5.08203125, - "learning_rate": 1.403120304629106e-07, - "loss": 0.3298, - "num_tokens": 1374999887.0, - "reward": 1.86962890625, - "reward_std": 0.5119246244430542, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.93798828125, - "rewards/tag_count_reward/std": 0.17883963882923126, + "grad_norm": 2.367326498031616, + "kl": 2.345703125, + "learning_rate": 1.4034215652988026e-07, + "loss": 0.1355, + "num_tokens": 1470616543.0, + "reward": 1.05029296875, + "reward_std": 0.33371466398239136, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.072265625, + "rewards/format_reward/std": 0.2591804563999176, + "rewards/tag_count_reward/mean": 0.91162109375, + "rewards/tag_count_reward/std": 0.2012113630771637, "step": 2572 }, { @@ -74603,27 +74603,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1806.0, - "completions/mean_length": 835.00390625, - "completions/mean_terminated_length": 798.3943481445312, - "completions/min_length": 91.0, - "completions/min_terminated_length": 91.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1104.765625, + "completions/mean_terminated_length": 1060.4007568359375, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, "epoch": 0.8783818383545277, - "grad_norm": 0.9163107872009277, - "kl": 5.84375, - "learning_rate": 1.4009053863667448e-07, - "loss": 0.343, - "num_tokens": 1375504465.0, - "reward": 1.89013671875, - "reward_std": 0.5146956443786621, - "rewards/accuracy_reward/mean": 0.08467742055654526, - "rewards/accuracy_reward/std": 0.278682142496109, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.18844488263130188, + "grad_norm": 4.336972713470459, + "kl": 1.791015625, + "learning_rate": 1.4012050176837205e-07, + "loss": 0.0606, + "num_tokens": 1471259239.0, + "reward": 1.134765625, + "reward_std": 0.37284043431282043, + "rewards/accuracy_reward/mean": 0.11290322244167328, + "rewards/accuracy_reward/std": 0.3167939782142639, + "rewards/format_reward/mean": 0.08203125, + "rewards/format_reward/std": 0.2746807038784027, + "rewards/tag_count_reward/mean": 0.943359375, + "rewards/tag_count_reward/std": 0.15707340836524963, "step": 2573 }, { @@ -74632,27 +74632,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.115234375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 833.171875, - "completions/mean_terminated_length": 801.5230712890625, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1165.580078125, + "completions/mean_terminated_length": 1050.6512451171875, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, "epoch": 0.8787232226679184, - "grad_norm": 1.5819618701934814, - "kl": 7.4765625, - "learning_rate": 1.3986962860194528e-07, - "loss": 0.4639, - "num_tokens": 1376011033.0, - "reward": 1.865234375, - "reward_std": 0.6026434898376465, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.2087314873933792, + "grad_norm": 1.7907490730285645, + "kl": 2.5, + "learning_rate": 1.3989942919731484e-07, + "loss": 0.1496, + "num_tokens": 1471936000.0, + "reward": 1.08447265625, + "reward_std": 0.40894293785095215, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.88134765625, + "rewards/tag_count_reward/std": 0.23916590213775635, "step": 2574 }, { @@ -74661,27 +74661,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 839.353515625, - "completions/mean_terminated_length": 810.3460083007812, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1118.337890625, + "completions/mean_terminated_length": 1045.9219970703125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, "epoch": 0.8790646069813092, - "grad_norm": 1.3439744710922241, - "kl": 6.078125, - "learning_rate": 1.396493006722645e-07, - "loss": 0.3989, - "num_tokens": 1376518014.0, - "reward": 1.90966796875, - "reward_std": 0.4882264733314514, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.1729232519865036, + "grad_norm": 2.558666467666626, + "kl": 2.142578125, + "learning_rate": 1.3967893913071898e-07, + "loss": 0.0975, + "num_tokens": 1472585821.0, + "reward": 1.11376953125, + "reward_std": 0.39217886328697205, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.08203125, + "rewards/format_reward/std": 0.2746807038784027, + "rewards/tag_count_reward/mean": 0.91259765625, + "rewards/tag_count_reward/std": 0.19981031119823456, "step": 2575 }, { @@ -74690,27 +74690,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.087890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1999.0, - "completions/mean_length": 840.857421875, - "completions/mean_terminated_length": 784.0797119140625, - "completions/min_length": 185.0, - "completions/min_terminated_length": 185.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1136.1484375, + "completions/mean_terminated_length": 1048.2825927734375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, "epoch": 0.8794059912947, - "grad_norm": 1.4315595626831055, - "kl": 7.328125, - "learning_rate": 1.3942955516034715e-07, - "loss": 0.4431, - "num_tokens": 1377022229.0, - "reward": 1.8798828125, - "reward_std": 0.5710894465446472, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310528099536896, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.9130859375, - "rewards/tag_count_reward/std": 0.21561963856220245, + "grad_norm": 7.2972588539123535, + "kl": 2.2890625, + "learning_rate": 1.3945903188176719e-07, + "loss": 0.1348, + "num_tokens": 1473241225.0, + "reward": 1.1259765625, + "reward_std": 0.4241016209125519, + "rewards/accuracy_reward/mean": 0.16330644488334656, + "rewards/accuracy_reward/std": 0.37001824378967285, + "rewards/format_reward/mean": 0.076171875, + "rewards/format_reward/std": 0.26553234457969666, + "rewards/tag_count_reward/mean": 0.8916015625, + "rewards/tag_count_reward/std": 0.2238643318414688, "step": 2576 }, { @@ -74719,27 +74719,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 857.81640625, - "completions/mean_terminated_length": 814.4494018554688, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1093.232421875, + "completions/mean_terminated_length": 1040.0804443359375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.8797473756080908, - "grad_norm": 1.4523320198059082, - "kl": 6.8515625, - "learning_rate": 1.3921039237808198e-07, - "loss": 0.4524, - "num_tokens": 1377537447.0, - "reward": 1.86669921875, - "reward_std": 0.5333712100982666, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.18445254862308502, + "grad_norm": 52.69700241088867, + "kl": 2.69140625, + "learning_rate": 1.3923970776281452e-07, + "loss": 0.1308, + "num_tokens": 1473876976.0, + "reward": 1.1044921875, + "reward_std": 0.4066890478134155, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.07421875, + "rewards/format_reward/std": 0.2623828947544098, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.19871997833251953, "step": 2577 }, { @@ -74748,27 +74748,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 812.955078125, - "completions/mean_terminated_length": 778.2349243164062, - "completions/min_length": 52.0, - "completions/min_terminated_length": 52.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1183.349609375, + "completions/mean_terminated_length": 1093.9029541015625, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, "epoch": 0.8800887599214816, - "grad_norm": 2.0251107215881348, - "kl": 5.703125, - "learning_rate": 1.3899181263653026e-07, - "loss": 0.3532, - "num_tokens": 1378044208.0, - "reward": 1.8525390625, - "reward_std": 0.5274684429168701, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.9306640625, - "rewards/tag_count_reward/std": 0.19175048172473907, + "grad_norm": 3.216992139816284, + "kl": 2.484375, + "learning_rate": 1.3902096708538762e-07, + "loss": 0.1625, + "num_tokens": 1474573379.0, + "reward": 1.0625, + "reward_std": 0.43896353244781494, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29176566004753113, + "rewards/tag_count_reward/mean": 0.88671875, + "rewards/tag_count_reward/std": 0.23483219742774963, "step": 2578 }, { @@ -74777,27 +74777,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1912.0, - "completions/mean_length": 843.6953125, - "completions/mean_terminated_length": 794.7398071289062, - "completions/min_length": 125.0, - "completions/min_terminated_length": 125.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1093.90625, + "completions/mean_terminated_length": 1021.7479248046875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, "epoch": 0.8804301442348724, - "grad_norm": 1.2781013250350952, - "kl": 8.453125, - "learning_rate": 1.3877381624592616e-07, - "loss": 0.5084, - "num_tokens": 1378553124.0, - "reward": 1.84619140625, - "reward_std": 0.6146891117095947, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.20686857402324677, + "grad_norm": 2.597442388534546, + "kl": 2.080078125, + "learning_rate": 1.3880281016018455e-07, + "loss": 0.1229, + "num_tokens": 1475210403.0, + "reward": 1.12060546875, + "reward_std": 0.39143675565719604, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.91162109375, + "rewards/tag_count_reward/std": 0.2077903300523758, "step": 2579 }, { @@ -74806,27 +74806,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1996.0, - "completions/mean_length": 826.57421875, - "completions/mean_terminated_length": 779.5009765625, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1070.03515625, + "completions/mean_terminated_length": 1013.4586181640625, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.8807715285482632, - "grad_norm": 2.706282138824463, - "kl": 7.61328125, - "learning_rate": 1.3855640351567553e-07, - "loss": 0.4517, - "num_tokens": 1379047098.0, - "reward": 1.9033203125, - "reward_std": 0.614270031452179, - "rewards/accuracy_reward/mean": 0.14453125, - "rewards/accuracy_reward/std": 0.35197147727012634, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.19744645059108734, + "grad_norm": 6.768167972564697, + "kl": 1.873046875, + "learning_rate": 1.3858523729707402e-07, + "loss": 0.1352, + "num_tokens": 1475829029.0, + "reward": 1.10986328125, + "reward_std": 0.3886840045452118, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, + "rewards/format_reward/mean": 0.04296875, + "rewards/format_reward/std": 0.2029850035905838, + "rewards/tag_count_reward/mean": 0.90673828125, + "rewards/tag_count_reward/std": 0.20444786548614502, "step": 2580 }, { @@ -74835,27 +74835,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1996.0, - "completions/mean_length": 825.232421875, - "completions/mean_terminated_length": 785.7882690429688, - "completions/min_length": 79.0, - "completions/min_terminated_length": 79.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1098.67578125, + "completions/mean_terminated_length": 1013.842529296875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.881112912861654, - "grad_norm": 1.6486424207687378, - "kl": 8.2578125, - "learning_rate": 1.3833957475435613e-07, - "loss": 0.4868, - "num_tokens": 1379555665.0, - "reward": 1.755859375, - "reward_std": 0.5890187621116638, - "rewards/accuracy_reward/mean": 0.04032257944345474, - "rewards/accuracy_reward/std": 0.19691328704357147, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.90625, - "rewards/tag_count_reward/std": 0.2144487351179123, + "grad_norm": 4.697291374206543, + "kl": 2.8828125, + "learning_rate": 1.3836824880509543e-07, + "loss": 0.1806, + "num_tokens": 1476477599.0, + "reward": 1.02490234375, + "reward_std": 0.35676270723342896, + "rewards/accuracy_reward/mean": 0.058467742055654526, + "rewards/accuracy_reward/std": 0.23486268520355225, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.89794921875, + "rewards/tag_count_reward/std": 0.21999086439609528, "step": 2581 }, { @@ -74864,27 +74864,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 864.81640625, - "completions/mean_terminated_length": 801.5184936523438, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1095.65234375, + "completions/mean_terminated_length": 1021.4694213867188, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, "epoch": 0.8814542971750448, - "grad_norm": 2.1603972911834717, - "kl": 8.6171875, - "learning_rate": 1.3812333026971663e-07, - "loss": 0.527, - "num_tokens": 1380081219.0, - "reward": 1.74462890625, - "reward_std": 0.5655951499938965, - "rewards/accuracy_reward/mean": 0.02734375, - "rewards/accuracy_reward/std": 0.16324250400066376, - "rewards/format_reward/mean": 0.810546875, - "rewards/format_reward/std": 0.3922513723373413, - "rewards/tag_count_reward/mean": 0.90673828125, - "rewards/tag_count_reward/std": 0.21323274075984955, + "grad_norm": 8.37886905670166, + "kl": 2.025390625, + "learning_rate": 1.3815184499245775e-07, + "loss": 0.1563, + "num_tokens": 1477121341.0, + "reward": 0.99609375, + "reward_std": 0.31779083609580994, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.1939331740140915, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.21269488334655762, "step": 2582 }, { @@ -74893,27 +74893,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1960.0, - "completions/mean_length": 871.83203125, - "completions/mean_terminated_length": 811.453857421875, - "completions/min_length": 168.0, - "completions/min_terminated_length": 168.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1130.4609375, + "completions/mean_terminated_length": 1054.8076171875, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, "epoch": 0.8817956814884356, - "grad_norm": 1.896267294883728, - "kl": 7.8515625, - "learning_rate": 1.3790767036867645e-07, - "loss": 0.4951, - "num_tokens": 1380610269.0, - "reward": 1.79296875, - "reward_std": 0.6150627136230469, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.814453125, - "rewards/format_reward/std": 0.38912075757980347, - "rewards/tag_count_reward/mean": 0.90234375, - "rewards/tag_count_reward/std": 0.22225576639175415, + "grad_norm": 2.9886717796325684, + "kl": 2.90625, + "learning_rate": 1.3793602616653977e-07, + "loss": 0.1757, + "num_tokens": 1477782809.0, + "reward": 1.0625, + "reward_std": 0.422885000705719, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.890625, + "rewards/tag_count_reward/std": 0.22502446174621582, "step": 2583 }, { @@ -74922,27 +74922,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, - "completions/mean_length": 857.380859375, - "completions/mean_terminated_length": 823.9096069335938, - "completions/min_length": 174.0, - "completions/min_terminated_length": 174.0, + "completions/mean_length": 1088.17578125, + "completions/mean_terminated_length": 1017.7484130859375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.8821370658018264, - "grad_norm": 1.679373860359192, - "kl": 6.578125, - "learning_rate": 1.3769259535732561e-07, - "loss": 0.4107, - "num_tokens": 1381125728.0, - "reward": 1.89794921875, - "reward_std": 0.6069402694702148, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.1981005072593689, + "grad_norm": 5.442252159118652, + "kl": 2.423828125, + "learning_rate": 1.377207926338894e-07, + "loss": 0.1481, + "num_tokens": 1478416435.0, + "reward": 1.17431640625, + "reward_std": 0.41693174839019775, + "rewards/accuracy_reward/mean": 0.220703125, + "rewards/accuracy_reward/std": 0.4151262938976288, + "rewards/format_reward/mean": 0.048828125, + "rewards/format_reward/std": 0.2157193273305893, + "rewards/tag_count_reward/mean": 0.90478515625, + "rewards/tag_count_reward/std": 0.2146575003862381, "step": 2584 }, { @@ -74951,27 +74951,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1908.0, - "completions/mean_length": 882.2265625, - "completions/mean_terminated_length": 824.8933715820312, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1148.48828125, + "completions/mean_terminated_length": 1094.4803466796875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.8824784501152172, - "grad_norm": 2.3373751640319824, - "kl": 6.546875, - "learning_rate": 1.374781055409235e-07, - "loss": 0.4046, - "num_tokens": 1381648676.0, - "reward": 1.85546875, - "reward_std": 0.5630089640617371, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.20833741128444672, + "grad_norm": 5.479371070861816, + "kl": 2.103515625, + "learning_rate": 1.37506144700223e-07, + "loss": 0.0854, + "num_tokens": 1479075709.0, + "reward": 1.09814453125, + "reward_std": 0.3576146066188812, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.1780041754245758, "step": 2585 }, { @@ -74980,27 +74980,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.091796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1974.0, - "completions/mean_length": 834.037109375, - "completions/mean_terminated_length": 761.1491088867188, - "completions/min_length": 62.0, - "completions/min_terminated_length": 62.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1145.599609375, + "completions/mean_terminated_length": 1054.3892822265625, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.882819834428608, - "grad_norm": 1.7196927070617676, - "kl": 6.6640625, - "learning_rate": 1.372642012238993e-07, - "loss": 0.4339, - "num_tokens": 1382149511.0, - "reward": 1.8330078125, - "reward_std": 0.5458844900131226, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.2043900191783905, + "grad_norm": 1.8868467807769775, + "kl": 2.41015625, + "learning_rate": 1.3729208267042524e-07, + "loss": 0.1494, + "num_tokens": 1479736064.0, + "reward": 1.0986328125, + "reward_std": 0.42256882786750793, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.076171875, + "rewards/format_reward/std": 0.26553234457969666, + "rewards/tag_count_reward/mean": 0.8955078125, + "rewards/tag_count_reward/std": 0.21911880373954773, "step": 2586 }, { @@ -75009,27 +75009,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 851.55859375, - "completions/mean_terminated_length": 802.9227294921875, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1077.48046875, + "completions/mean_terminated_length": 1004.0798950195312, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, "epoch": 0.8831612187419988, - "grad_norm": 2.913691759109497, - "kl": 6.65625, - "learning_rate": 1.3705088270985103e-07, - "loss": 0.457, - "num_tokens": 1382662069.0, - "reward": 1.82958984375, - "reward_std": 0.5881297588348389, - "rewards/accuracy_reward/mean": 0.08467742055654526, - "rewards/accuracy_reward/std": 0.278682142496109, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.21267634630203247, + "grad_norm": 2.0429177284240723, + "kl": 1.9111328125, + "learning_rate": 1.3707860684854872e-07, + "loss": 0.0989, + "num_tokens": 1480364294.0, + "reward": 1.1416015625, + "reward_std": 0.3803124725818634, + "rewards/accuracy_reward/mean": 0.14717741310596466, + "rewards/accuracy_reward/std": 0.3546403646469116, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.19495287537574768, "step": 2587 }, { @@ -75038,27 +75038,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 881.123046875, - "completions/mean_terminated_length": 816.1629028320312, - "completions/min_length": 6.0, - "completions/min_terminated_length": 6.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1125.865234375, + "completions/mean_terminated_length": 1054.0357666015625, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, "epoch": 0.8835026030553896, - "grad_norm": 2.5694093704223633, - "kl": 6.5703125, - "learning_rate": 1.3683815030154538e-07, - "loss": 0.4404, - "num_tokens": 1383187316.0, - "reward": 1.7939453125, - "reward_std": 0.6087914705276489, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.9130859375, - "rewards/tag_count_reward/std": 0.2081148475408554, + "grad_norm": 1.641013264656067, + "kl": 2.01171875, + "learning_rate": 1.3686571753781302e-07, + "loss": 0.1268, + "num_tokens": 1481014849.0, + "reward": 1.06298828125, + "reward_std": 0.38058561086654663, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.19151799380779266, "step": 2588 }, { @@ -75067,27 +75067,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 878.44140625, - "completions/mean_terminated_length": 810.7809448242188, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1181.50390625, + "completions/mean_terminated_length": 1106.0765380859375, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, "epoch": 0.8838439873687804, - "grad_norm": 2.169834613800049, - "kl": 6.734375, - "learning_rate": 1.3662600430091707e-07, - "loss": 0.4543, - "num_tokens": 1383720246.0, - "reward": 1.78662109375, - "reward_std": 0.5730187892913818, - "rewards/accuracy_reward/mean": 0.0463709682226181, - "rewards/accuracy_reward/std": 0.21049949526786804, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.91357421875, - "rewards/tag_count_reward/std": 0.21152304112911224, + "grad_norm": 6.263635158538818, + "kl": 2.072265625, + "learning_rate": 1.36653415040605e-07, + "loss": 0.1046, + "num_tokens": 1481702947.0, + "reward": 1.0849609375, + "reward_std": 0.42133569717407227, + "rewards/accuracy_reward/mean": 0.11088709533214569, + "rewards/accuracy_reward/std": 0.3143092691898346, + "rewards/format_reward/mean": 0.076171875, + "rewards/format_reward/std": 0.26553234457969666, + "rewards/tag_count_reward/mean": 0.9013671875, + "rewards/tag_count_reward/std": 0.21396473050117493, "step": 2589 }, { @@ -75096,27 +75096,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 864.8125, - "completions/mean_terminated_length": 819.2129516601562, - "completions/min_length": 172.0, - "completions/min_terminated_length": 172.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1121.6015625, + "completions/mean_terminated_length": 1057.7786865234375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, "epoch": 0.8841853716821712, - "grad_norm": 1.9821420907974243, - "kl": 6.7578125, - "learning_rate": 1.3641444500906846e-07, - "loss": 0.4474, - "num_tokens": 1384244406.0, - "reward": 1.810546875, - "reward_std": 0.5634486675262451, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.21601147949695587, + "grad_norm": 3.618290662765503, + "kl": 2.6953125, + "learning_rate": 1.3644169965847787e-07, + "loss": 0.1316, + "num_tokens": 1482358583.0, + "reward": 1.07763671875, + "reward_std": 0.39267897605895996, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.92138671875, + "rewards/tag_count_reward/std": 0.19357748329639435, "step": 2590 }, { @@ -75127,25 +75127,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 852.43359375, - "completions/mean_terminated_length": 796.2003784179688, - "completions/min_length": 123.0, - "completions/min_terminated_length": 123.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1044.71484375, + "completions/mean_terminated_length": 997.5255126953125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.884526755995562, - "grad_norm": 1.7044569253921509, - "kl": 7.09375, - "learning_rate": 1.3620347272626933e-07, - "loss": 0.447, - "num_tokens": 1384750324.0, - "reward": 1.857421875, - "reward_std": 0.5381364822387695, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.91796875, - "rewards/tag_count_reward/std": 0.20478467643260956, + "grad_norm": 2.9081003665924072, + "kl": 2.037109375, + "learning_rate": 1.3623057169215102e-07, + "loss": 0.0992, + "num_tokens": 1482962949.0, + "reward": 1.15185546875, + "reward_std": 0.3780638575553894, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, + "rewards/format_reward/mean": 0.072265625, + "rewards/format_reward/std": 0.2591804563999176, + "rewards/tag_count_reward/mean": 0.92333984375, + "rewards/tag_count_reward/std": 0.18066298961639404, "step": 2591 }, { @@ -75154,27 +75154,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1850.0, - "completions/mean_length": 818.771484375, - "completions/mean_terminated_length": 760.9550170898438, - "completions/min_length": 96.0, - "completions/min_terminated_length": 96.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1033.685546875, + "completions/mean_terminated_length": 994.5942993164062, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, "epoch": 0.8848681403089528, - "grad_norm": 1.2714744806289673, - "kl": 7.65625, - "learning_rate": 1.359930877519562e-07, - "loss": 0.4809, - "num_tokens": 1385238815.0, - "reward": 1.84912109375, - "reward_std": 0.5803292989730835, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.19979596138000488, + "grad_norm": 1.4806687831878662, + "kl": 1.90234375, + "learning_rate": 1.3602003144150926e-07, + "loss": 0.0857, + "num_tokens": 1483561476.0, + "reward": 1.13720703125, + "reward_std": 0.4037706255912781, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.93212890625, + "rewards/tag_count_reward/std": 0.17178115248680115, "step": 2592 }, { @@ -75183,27 +75183,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1976.0, - "completions/mean_length": 842.171875, - "completions/mean_terminated_length": 777.6625366210938, - "completions/min_length": 88.0, - "completions/min_terminated_length": 88.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1100.26171875, + "completions/mean_terminated_length": 1055.68505859375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, "epoch": 0.8852095246223436, - "grad_norm": 1.324734091758728, - "kl": 5.6796875, - "learning_rate": 1.3578329038473222e-07, - "loss": 0.3821, - "num_tokens": 1385745831.0, - "reward": 1.8408203125, - "reward_std": 0.49896273016929626, - "rewards/accuracy_reward/mean": 0.05645161122083664, - "rewards/accuracy_reward/std": 0.23102475702762604, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.1972333937883377, + "grad_norm": 5.997927188873291, + "kl": 2.296875, + "learning_rate": 1.3581007920560281e-07, + "loss": 0.0993, + "num_tokens": 1484200634.0, + "reward": 1.08251953125, + "reward_std": 0.38572585582733154, + "rewards/accuracy_reward/mean": 0.060483869165182114, + "rewards/accuracy_reward/std": 0.2386218160390854, + "rewards/format_reward/mean": 0.095703125, + "rewards/format_reward/std": 0.2944713830947876, + "rewards/tag_count_reward/mean": 0.92822265625, + "rewards/tag_count_reward/std": 0.17722409963607788, "step": 2593 }, { @@ -75212,27 +75212,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 788.927734375, - "completions/mean_terminated_length": 756.1262817382812, - "completions/min_length": 94.0, - "completions/min_terminated_length": 94.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1042.53125, + "completions/mean_terminated_length": 977.729736328125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, "epoch": 0.8855509089357344, - "grad_norm": 1.3588322401046753, - "kl": 7.2421875, - "learning_rate": 1.355740809223662e-07, - "loss": 0.4772, - "num_tokens": 1386223330.0, - "reward": 1.92333984375, - "reward_std": 0.5310149788856506, - "rewards/accuracy_reward/mean": 0.1328125, - "rewards/accuracy_reward/std": 0.33970388770103455, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.19089330732822418, + "grad_norm": 5.953350067138672, + "kl": 3.009765625, + "learning_rate": 1.3560071528264653e-07, + "loss": 0.1387, + "num_tokens": 1484807978.0, + "reward": 1.1689453125, + "reward_std": 0.43317079544067383, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3776407241821289, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.9189453125, + "rewards/tag_count_reward/std": 0.1954032927751541, "step": 2594 }, { @@ -75241,27 +75241,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 852.869140625, - "completions/mean_terminated_length": 783.7293090820312, - "completions/min_length": 225.0, - "completions/min_terminated_length": 225.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1087.54296875, + "completions/mean_terminated_length": 1048.5, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, "epoch": 0.8858922932491252, - "grad_norm": 1.2075051069259644, - "kl": 6.9765625, - "learning_rate": 1.3536545966179274e-07, - "loss": 0.4607, - "num_tokens": 1386729151.0, - "reward": 1.85302734375, - "reward_std": 0.4970835745334625, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.19595204293727875, + "grad_norm": 2.646212339401245, + "kl": 2.111328125, + "learning_rate": 1.3539193997001976e-07, + "loss": 0.1032, + "num_tokens": 1485433952.0, + "reward": 1.11865234375, + "reward_std": 0.36381471157073975, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.93701171875, + "rewards/tag_count_reward/std": 0.17079374194145203, "step": 2595 }, { @@ -75270,27 +75270,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 890.853515625, - "completions/mean_terminated_length": 836.4273681640625, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1163.26953125, + "completions/mean_terminated_length": 1102.3173828125, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, "epoch": 0.886233677562516, - "grad_norm": 0.8362195491790771, - "kl": 6.6953125, - "learning_rate": 1.3515742689911166e-07, - "loss": 0.4196, - "num_tokens": 1387257892.0, - "reward": 1.8447265625, - "reward_std": 0.5106940269470215, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.19908507168293, + "grad_norm": 5.337454319000244, + "kl": 2.9765625, + "learning_rate": 1.351837535642657e-07, + "loss": 0.1189, + "num_tokens": 1486102170.0, + "reward": 1.02783203125, + "reward_std": 0.4115527868270874, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.88330078125, + "rewards/tag_count_reward/std": 0.2286466658115387, "step": 2596 }, { @@ -75299,27 +75299,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 763.521484375, - "completions/mean_terminated_length": 730.05810546875, - "completions/min_length": 96.0, - "completions/min_terminated_length": 96.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1085.591796875, + "completions/mean_terminated_length": 1025.69091796875, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, "epoch": 0.8865750618759068, - "grad_norm": 0.9312205910682678, - "kl": 4.65625, - "learning_rate": 1.3494998292958725e-07, - "loss": 0.2835, - "num_tokens": 1387725487.0, - "reward": 1.89501953125, - "reward_std": 0.4495265483856201, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.94384765625, - "rewards/tag_count_reward/std": 0.17387108504772186, + "grad_norm": 4.097114086151123, + "kl": 1.826171875, + "learning_rate": 1.3497615636109124e-07, + "loss": 0.1159, + "num_tokens": 1486734665.0, + "reward": 1.07080078125, + "reward_std": 0.34852519631385803, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.072265625, + "rewards/format_reward/std": 0.2591804563999176, + "rewards/tag_count_reward/mean": 0.92236328125, + "rewards/tag_count_reward/std": 0.18690726161003113, "step": 2597 }, { @@ -75328,27 +75328,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1930.0, - "completions/mean_length": 846.666015625, - "completions/mean_terminated_length": 810.408447265625, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1104.447265625, + "completions/mean_terminated_length": 1066.0914306640625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, "epoch": 0.8869164461892975, - "grad_norm": 1.2470439672470093, - "kl": 7.0703125, - "learning_rate": 1.3474312804764853e-07, - "loss": 0.4707, - "num_tokens": 1388233700.0, - "reward": 1.84033203125, - "reward_std": 0.5441080331802368, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.19630283117294312, + "grad_norm": 4.098260879516602, + "kl": 1.693359375, + "learning_rate": 1.3476914865536608e-07, + "loss": 0.0579, + "num_tokens": 1487374862.0, + "reward": 1.052734375, + "reward_std": 0.34565094113349915, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.16699250042438507, "step": 2598 }, { @@ -75357,27 +75357,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 819.166015625, - "completions/mean_terminated_length": 763.9938354492188, - "completions/min_length": 60.0, - "completions/min_terminated_length": 60.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1085.302734375, + "completions/mean_terminated_length": 1027.5010986328125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, "epoch": 0.8872578305026884, - "grad_norm": 1.1735024452209473, - "kl": 6.5625, - "learning_rate": 1.345368625468879e-07, - "loss": 0.4053, - "num_tokens": 1388731097.0, - "reward": 1.888671875, - "reward_std": 0.499426007270813, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.1810387820005417, + "grad_norm": 6.141515731811523, + "kl": 2.33203125, + "learning_rate": 1.3456273074112287e-07, + "loss": 0.0997, + "num_tokens": 1488008521.0, + "reward": 1.09375, + "reward_std": 0.38844335079193115, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.1852131336927414, "step": 2599 }, { @@ -75386,27 +75386,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1895.0, - "completions/mean_length": 827.431640625, - "completions/mean_terminated_length": 788.0584716796875, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1182.40625, + "completions/mean_terminated_length": 1078.23193359375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, "epoch": 0.8875992148160792, - "grad_norm": 0.7364870309829712, - "kl": 7.4296875, - "learning_rate": 1.3433118672006173e-07, - "loss": 0.4804, - "num_tokens": 1389228198.0, - "reward": 1.89599609375, - "reward_std": 0.5626035928726196, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.1838454306125641, + "grad_norm": 2.642157554626465, + "kl": 3.15234375, + "learning_rate": 1.3435690291155627e-07, + "loss": 0.1766, + "num_tokens": 1488687369.0, + "reward": 1.1142578125, + "reward_std": 0.4052448570728302, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.072265625, + "rewards/format_reward/std": 0.2591804563999176, + "rewards/tag_count_reward/mean": 0.9091796875, + "rewards/tag_count_reward/std": 0.21683931350708008, "step": 2600 }, { @@ -75415,27 +75415,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1977.0, - "completions/mean_length": 824.7578125, - "completions/mean_terminated_length": 797.9002075195312, - "completions/min_length": 71.0, - "completions/min_terminated_length": 71.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1183.5078125, + "completions/mean_terminated_length": 1116.1683349609375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, "epoch": 0.88794059912947, - "grad_norm": 0.8736045360565186, - "kl": 5.765625, - "learning_rate": 1.3412610085908912e-07, - "loss": 0.3384, - "num_tokens": 1389723914.0, - "reward": 1.86865234375, - "reward_std": 0.4505782723426819, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.94677734375, - "rewards/tag_count_reward/std": 0.16837652027606964, + "grad_norm": 3.0880329608917236, + "kl": 2.154296875, + "learning_rate": 1.341516654590231e-07, + "loss": 0.1188, + "num_tokens": 1489366765.0, + "reward": 1.064453125, + "reward_std": 0.36538228392601013, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.08984375, + "rewards/format_reward/std": 0.2862374484539032, + "rewards/tag_count_reward/mean": 0.919921875, + "rewards/tag_count_reward/std": 0.18816134333610535, "step": 2601 }, { @@ -75444,27 +75444,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 804.08203125, - "completions/mean_terminated_length": 766.5391845703125, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1095.716796875, + "completions/mean_terminated_length": 1050.9263916015625, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, "epoch": 0.8882819834428608, - "grad_norm": 1.735905647277832, - "kl": 6.46875, - "learning_rate": 1.3392160525505191e-07, - "loss": 0.4185, - "num_tokens": 1390210644.0, - "reward": 1.97119140625, - "reward_std": 0.5567293167114258, - "rewards/accuracy_reward/mean": 0.13709677755832672, - "rewards/accuracy_reward/std": 0.34429675340652466, - "rewards/format_reward/mean": 0.896484375, - "rewards/format_reward/std": 0.30492907762527466, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.1794690042734146, + "grad_norm": 1.7701585292816162, + "kl": 1.544921875, + "learning_rate": 1.339470186750413e-07, + "loss": 0.0684, + "num_tokens": 1490002812.0, + "reward": 1.2080078125, + "reward_std": 0.4486067593097687, + "rewards/accuracy_reward/mean": 0.2177419364452362, + "rewards/accuracy_reward/std": 0.41312772035598755, + "rewards/format_reward/mean": 0.072265625, + "rewards/format_reward/std": 0.2591804563999176, + "rewards/tag_count_reward/mean": 0.9248046875, + "rewards/tag_count_reward/std": 0.18822988867759705, "step": 2602 }, { @@ -75473,27 +75473,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1984.0, - "completions/mean_length": 764.5546875, - "completions/mean_terminated_length": 738.9880981445312, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1036.943359375, + "completions/mean_terminated_length": 989.3885498046875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.8886233677562516, - "grad_norm": 1.4352848529815674, - "kl": 6.046875, - "learning_rate": 1.3371770019819433e-07, - "loss": 0.3766, - "num_tokens": 1390676720.0, - "reward": 1.90673828125, - "reward_std": 0.46058928966522217, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.18217463791370392, + "grad_norm": 3.172309398651123, + "kl": 2.37890625, + "learning_rate": 1.3374296285029014e-07, + "loss": 0.131, + "num_tokens": 1490608351.0, + "reward": 1.1083984375, + "reward_std": 0.3886687755584717, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.19495287537574768, "step": 2603 }, { @@ -75502,27 +75502,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1789.0, - "completions/mean_length": 777.220703125, - "completions/mean_terminated_length": 744.1142578125, - "completions/min_length": 179.0, - "completions/min_terminated_length": 179.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1055.9765625, + "completions/mean_terminated_length": 1009.3169555664062, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, "epoch": 0.8889647520696424, - "grad_norm": 0.971156120300293, - "kl": 6.19140625, - "learning_rate": 1.3351438597792218e-07, - "loss": 0.3897, - "num_tokens": 1391146785.0, - "reward": 1.93408203125, - "reward_std": 0.47433918714523315, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.93994140625, - "rewards/tag_count_reward/std": 0.18880455195903778, + "grad_norm": 1.8199608325958252, + "kl": 1.73828125, + "learning_rate": 1.335394982746091e-07, + "loss": 0.0803, + "num_tokens": 1491221139.0, + "reward": 1.177734375, + "reward_std": 0.3965303897857666, + "rewards/accuracy_reward/mean": 0.166015625, + "rewards/accuracy_reward/std": 0.3724585771560669, + "rewards/format_reward/mean": 0.080078125, + "rewards/format_reward/std": 0.271679550409317, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.1782301664352417, "step": 2604 }, { @@ -75531,27 +75531,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1734.0, - "completions/mean_length": 726.63671875, - "completions/mean_terminated_length": 708.32080078125, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1024.482421875, + "completions/mean_terminated_length": 974.1454467773438, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, "epoch": 0.8893061363830332, - "grad_norm": 0.963898777961731, - "kl": 4.296875, - "learning_rate": 1.3331166288280295e-07, - "loss": 0.2596, - "num_tokens": 1391593767.0, - "reward": 1.9560546875, - "reward_std": 0.44860565662384033, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.9140625, - "rewards/format_reward/std": 0.28054583072662354, - "rewards/tag_count_reward/mean": 0.9580078125, - "rewards/tag_count_reward/std": 0.1448541134595871, + "grad_norm": 2.8290622234344482, + "kl": 2.4765625, + "learning_rate": 1.333366252369983e-07, + "loss": 0.1535, + "num_tokens": 1491820618.0, + "reward": 1.1318359375, + "reward_std": 0.376934677362442, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, + "rewards/format_reward/mean": 0.05078125, + "rewards/format_reward/std": 0.21976542472839355, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.18808770179748535, "step": 2605 }, { @@ -75560,27 +75560,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 794.787109375, - "completions/mean_terminated_length": 767.271484375, - "completions/min_length": 5.0, - "completions/min_terminated_length": 5.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1082.17578125, + "completions/mean_terminated_length": 1017.7875366210938, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, "epoch": 0.889647520696424, - "grad_norm": 1.4902775287628174, - "kl": 5.7890625, - "learning_rate": 1.3310953120056488e-07, - "loss": 0.3435, - "num_tokens": 1392072906.0, - "reward": 2.00146484375, - "reward_std": 0.5466192364692688, - "rewards/accuracy_reward/mean": 0.166015625, - "rewards/accuracy_reward/std": 0.3724585771560669, - "rewards/format_reward/mean": 0.892578125, - "rewards/format_reward/std": 0.30995169281959534, - "rewards/tag_count_reward/mean": 0.94287109375, - "rewards/tag_count_reward/std": 0.17841704189777374, + "grad_norm": 6.18074893951416, + "kl": 2.18359375, + "learning_rate": 1.3313434402561725e-07, + "loss": 0.1652, + "num_tokens": 1492446900.0, + "reward": 1.22314453125, + "reward_std": 0.39326757192611694, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.42882615327835083, + "rewards/format_reward/mean": 0.056640625, + "rewards/format_reward/std": 0.23138070106506348, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.18174928426742554, "step": 2606 }, { @@ -75589,27 +75589,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1941.0, - "completions/mean_length": 809.595703125, - "completions/mean_terminated_length": 761.8681640625, - "completions/min_length": 102.0, - "completions/min_terminated_length": 102.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1104.5859375, + "completions/mean_terminated_length": 1015.888916015625, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, "epoch": 0.8899889050098148, - "grad_norm": 1.5362666845321655, - "kl": 7.359375, - "learning_rate": 1.3290799121809702e-07, - "loss": 0.4408, - "num_tokens": 1392561467.0, - "reward": 1.8310546875, - "reward_std": 0.5468693375587463, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.9267578125, - "rewards/tag_count_reward/std": 0.1959892213344574, + "grad_norm": 6.825827121734619, + "kl": 2.22265625, + "learning_rate": 1.3293265492778502e-07, + "loss": 0.1564, + "num_tokens": 1493086496.0, + "reward": 1.0810546875, + "reward_std": 0.3840760588645935, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.8994140625, + "rewards/tag_count_reward/std": 0.21703311800956726, "step": 2607 }, { @@ -75618,27 +75618,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 873.70703125, - "completions/mean_terminated_length": 835.8265991210938, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1172.501953125, + "completions/mean_terminated_length": 1116.076904296875, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, "epoch": 0.8903302893232056, - "grad_norm": 1.9522093534469604, - "kl": 6.46875, - "learning_rate": 1.3270704322144832e-07, - "loss": 0.3822, - "num_tokens": 1393086069.0, - "reward": 1.8466796875, - "reward_std": 0.49706533551216125, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.9306640625, - "rewards/tag_count_reward/std": 0.19238728284835815, + "grad_norm": 4.828230857849121, + "kl": 1.912109375, + "learning_rate": 1.3273155822997975e-07, + "loss": 0.0708, + "num_tokens": 1493764081.0, + "reward": 1.07763671875, + "reward_std": 0.38608235120773315, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.92724609375, + "rewards/tag_count_reward/std": 0.17957013845443726, "step": 2608 }, { @@ -75647,27 +75647,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1944.0, - "completions/mean_length": 824.224609375, - "completions/mean_terminated_length": 784.7479858398438, - "completions/min_length": 85.0, - "completions/min_terminated_length": 85.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1070.37109375, + "completions/mean_terminated_length": 991.9957275390625, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, "epoch": 0.8906716736365964, - "grad_norm": 1.0797889232635498, - "kl": 7.1015625, - "learning_rate": 1.3250668749582782e-07, - "loss": 0.4339, - "num_tokens": 1393581656.0, - "reward": 1.9013671875, - "reward_std": 0.5164161324501038, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.18339595198631287, + "grad_norm": 2.175081491470337, + "kl": 2.541015625, + "learning_rate": 1.3253105421783794e-07, + "loss": 0.1681, + "num_tokens": 1494385695.0, + "reward": 1.16650390625, + "reward_std": 0.4186092019081116, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38430243730545044, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.91845703125, + "rewards/tag_count_reward/std": 0.1942574828863144, "step": 2609 }, { @@ -75676,27 +75676,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 807.873046875, - "completions/mean_terminated_length": 783.1693725585938, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1165.125, + "completions/mean_terminated_length": 1090.3050537109375, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, "epoch": 0.8910130579499872, - "grad_norm": 1.6566232442855835, - "kl": 5.03125, - "learning_rate": 1.3230692432560403e-07, - "loss": 0.3093, - "num_tokens": 1394069015.0, - "reward": 1.88427734375, - "reward_std": 0.5347901582717896, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.94091796875, - "rewards/tag_count_reward/std": 0.16640710830688477, + "grad_norm": 4.508205890655518, + "kl": 2.671875, + "learning_rate": 1.3233114317615436e-07, + "loss": 0.1315, + "num_tokens": 1495055967.0, + "reward": 1.12841796875, + "reward_std": 0.4585619866847992, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.08203125, + "rewards/format_reward/std": 0.2746807038784027, + "rewards/tag_count_reward/mean": 0.90576171875, + "rewards/tag_count_reward/std": 0.2093251794576645, "step": 2610 }, { @@ -75705,27 +75705,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 825.880859375, - "completions/mean_terminated_length": 806.482177734375, - "completions/min_length": 60.0, - "completions/min_terminated_length": 60.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1165.01953125, + "completions/mean_terminated_length": 1086.1148681640625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.891354442263378, - "grad_norm": 1.545709490776062, - "kl": 5.796875, - "learning_rate": 1.321077539943039e-07, - "loss": 0.3447, - "num_tokens": 1394574554.0, - "reward": 1.861328125, - "reward_std": 0.5165987014770508, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.18567688763141632, + "grad_norm": 2.893400192260742, + "kl": 2.41796875, + "learning_rate": 1.3213182538888146e-07, + "loss": 0.1541, + "num_tokens": 1495735145.0, + "reward": 1.07421875, + "reward_std": 0.3993414342403412, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.08984375, + "rewards/format_reward/std": 0.2862374484539032, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.21027308702468872, "step": 2611 }, { @@ -75734,27 +75734,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1865.0, - "completions/mean_length": 823.744140625, - "completions/mean_terminated_length": 786.7947387695312, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1105.142578125, + "completions/mean_terminated_length": 1040.185791015625, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, "epoch": 0.8916958265767688, - "grad_norm": 0.9597628712654114, - "kl": 6.40625, - "learning_rate": 1.319091767846136e-07, - "loss": 0.3888, - "num_tokens": 1395068855.0, - "reward": 1.8359375, - "reward_std": 0.5429961681365967, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.1992122381925583, + "grad_norm": 2.8259124755859375, + "kl": 2.91015625, + "learning_rate": 1.31933101139129e-07, + "loss": 0.1368, + "num_tokens": 1496373522.0, + "reward": 1.076171875, + "reward_std": 0.3546481430530548, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.900390625, + "rewards/tag_count_reward/std": 0.22138561308383942, "step": 2612 }, { @@ -75763,27 +75763,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 832.119140625, - "completions/mean_terminated_length": 785.2596435546875, - "completions/min_length": 92.0, - "completions/min_terminated_length": 92.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1137.958984375, + "completions/mean_terminated_length": 1060.8369140625, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, "epoch": 0.8920372108901596, - "grad_norm": 2.148956060409546, - "kl": 8.0625, - "learning_rate": 1.3171119297837686e-07, - "loss": 0.4453, - "num_tokens": 1395578452.0, - "reward": 1.7353515625, - "reward_std": 0.5739935040473938, - "rewards/accuracy_reward/mean": 0.025390625, - "rewards/accuracy_reward/std": 0.15746226906776428, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.9072265625, - "rewards/tag_count_reward/std": 0.21258479356765747, + "grad_norm": 3.939441442489624, + "kl": 2.822265625, + "learning_rate": 1.317349707091638e-07, + "loss": 0.1316, + "num_tokens": 1497039709.0, + "reward": 1.03759765625, + "reward_std": 0.37869730591773987, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.2029850035905838, + "rewards/format_reward/mean": 0.091796875, + "rewards/format_reward/std": 0.289021372795105, + "rewards/tag_count_reward/mean": 0.90283203125, + "rewards/tag_count_reward/std": 0.21089871227741241, "step": 2613 }, { @@ -75792,27 +75792,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 846.14453125, - "completions/mean_terminated_length": 804.8687133789062, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1123.126953125, + "completions/mean_terminated_length": 1059.4091796875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.8923785952035505, - "grad_norm": 1.870068907737732, - "kl": 8.015625, - "learning_rate": 1.3151380285659565e-07, - "loss": 0.4641, - "num_tokens": 1396087358.0, - "reward": 1.77880859375, - "reward_std": 0.5826443433761597, - "rewards/accuracy_reward/mean": 0.05645161122083664, - "rewards/accuracy_reward/std": 0.23102475702762604, - "rewards/format_reward/mean": 0.814453125, - "rewards/format_reward/std": 0.38912075757980347, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.21162240207195282, + "grad_norm": 2.58689284324646, + "kl": 2.357421875, + "learning_rate": 1.315374343804092e-07, + "loss": 0.1303, + "num_tokens": 1497690430.0, + "reward": 1.12890625, + "reward_std": 0.42655646800994873, + "rewards/accuracy_reward/mean": 0.12298387289047241, + "rewards/accuracy_reward/std": 0.32875028252601624, + "rewards/format_reward/mean": 0.095703125, + "rewards/format_reward/std": 0.2944713830947876, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.1995285600423813, "step": 2614 }, { @@ -75821,27 +75821,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 778.541015625, - "completions/mean_terminated_length": 734.9434814453125, - "completions/min_length": 7.0, - "completions/min_terminated_length": 7.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1085.357421875, + "completions/mean_terminated_length": 1016.8848876953125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, "epoch": 0.8927199795169412, - "grad_norm": 1.4676388502120972, - "kl": 6.90625, - "learning_rate": 1.3131700669942907e-07, - "loss": 0.4164, - "num_tokens": 1396562211.0, - "reward": 1.8330078125, - "reward_std": 0.5347275733947754, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.2055833637714386, + "grad_norm": 4.555649280548096, + "kl": 2.8046875, + "learning_rate": 1.313404924334447e-07, + "loss": 0.176, + "num_tokens": 1498322373.0, + "reward": 1.06640625, + "reward_std": 0.37292221188545227, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.2275916188955307, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.20571577548980713, "step": 2615 }, { @@ -75850,27 +75850,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, + "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 929.205078125, - "completions/mean_terminated_length": 859.570556640625, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1178.947265625, + "completions/mean_terminated_length": 1097.241455078125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.893061363830332, - "grad_norm": 1.4115605354309082, - "kl": 6.88671875, - "learning_rate": 1.3112080478619333e-07, - "loss": 0.4301, - "num_tokens": 1397121068.0, - "reward": 1.77197265625, - "reward_std": 0.5459601879119873, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.81640625, - "rewards/format_reward/std": 0.3875311613082886, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.20380185544490814, + "grad_norm": 8.320698738098145, + "kl": 2.2265625, + "learning_rate": 1.3114414514800532e-07, + "loss": 0.1673, + "num_tokens": 1499009098.0, + "reward": 1.04150390625, + "reward_std": 0.37569621205329895, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.07421875, + "rewards/format_reward/std": 0.2623828947544098, + "rewards/tag_count_reward/mean": 0.90087890625, + "rewards/tag_count_reward/std": 0.20940276980400085, "step": 2616 }, { @@ -75879,27 +75879,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 818.673828125, - "completions/mean_terminated_length": 768.701171875, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1069.419921875, + "completions/mean_terminated_length": 1014.9423217773438, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.8934027481437228, - "grad_norm": 2.140897035598755, - "kl": 6.7109375, - "learning_rate": 1.309251973953612e-07, - "loss": 0.4308, - "num_tokens": 1397615061.0, - "reward": 1.8173828125, - "reward_std": 0.5434540510177612, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.19335831701755524, + "grad_norm": 1.2287455797195435, + "kl": 1.994140625, + "learning_rate": 1.3094839280298182e-07, + "loss": 0.082, + "num_tokens": 1499631473.0, + "reward": 1.13134765625, + "reward_std": 0.4147496521472931, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29176566004753113, + "rewards/tag_count_reward/mean": 0.91845703125, + "rewards/tag_count_reward/std": 0.20167149603366852, "step": 2617 }, { @@ -75908,27 +75908,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1965.0, - "completions/mean_length": 806.73046875, - "completions/mean_terminated_length": 766.6895141601562, - "completions/min_length": 70.0, - "completions/min_terminated_length": 70.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1022.142578125, + "completions/mean_terminated_length": 980.4410400390625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.8937441324571136, - "grad_norm": 1.4841294288635254, - "kl": 5.3203125, - "learning_rate": 1.3073018480456148e-07, - "loss": 0.3235, - "num_tokens": 1398107899.0, - "reward": 1.873046875, - "reward_std": 0.5673485994338989, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.19933690130710602, + "grad_norm": 1.7126773595809937, + "kl": 2.263671875, + "learning_rate": 1.3075323567641945e-07, + "loss": 0.1111, + "num_tokens": 1500234602.0, + "reward": 1.130859375, + "reward_std": 0.394452303647995, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.18587233126163483, "step": 2618 }, { @@ -75937,27 +75937,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1928.0, - "completions/mean_length": 863.703125, - "completions/mean_terminated_length": 813.0509643554688, - "completions/min_length": 87.0, - "completions/min_terminated_length": 87.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1160.025390625, + "completions/mean_terminated_length": 1072.3712158203125, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, "epoch": 0.8940855167705044, - "grad_norm": 2.4232535362243652, - "kl": 5.84375, - "learning_rate": 1.3053576729057902e-07, - "loss": 0.3975, - "num_tokens": 1398631731.0, - "reward": 1.80712890625, - "reward_std": 0.5566418766975403, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.21288292109966278, + "grad_norm": 9.404876708984375, + "kl": 2.462890625, + "learning_rate": 1.305586740455184e-07, + "loss": 0.1667, + "num_tokens": 1500910151.0, + "reward": 1.00146484375, + "reward_std": 0.35454005002975464, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.041015625, + "rewards/format_reward/std": 0.19852031767368317, + "rewards/tag_count_reward/mean": 0.89404296875, + "rewards/tag_count_reward/std": 0.22907254099845886, "step": 2619 }, { @@ -75966,27 +75966,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1956.0, - "completions/mean_length": 868.373046875, - "completions/mean_terminated_length": 825.3906860351562, - "completions/min_length": 29.0, - "completions/min_terminated_length": 29.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1145.693359375, + "completions/mean_terminated_length": 1069.2266845703125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, "epoch": 0.8944269010838952, - "grad_norm": 2.421449899673462, - "kl": 4.34765625, - "learning_rate": 1.3034194512935377e-07, - "loss": 0.2908, - "num_tokens": 1399159682.0, - "reward": 1.87890625, - "reward_std": 0.497196763753891, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.18438583612442017, + "grad_norm": 7.759430408477783, + "kl": 2.501953125, + "learning_rate": 1.3036470818663282e-07, + "loss": 0.11, + "num_tokens": 1501580090.0, + "reward": 1.1298828125, + "reward_std": 0.4416842758655548, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.091796875, + "rewards/format_reward/std": 0.289021372795105, + "rewards/tag_count_reward/mean": 0.8994140625, + "rewards/tag_count_reward/std": 0.2101617306470871, "step": 2620 }, { @@ -75995,27 +75995,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 814.171875, - "completions/mean_terminated_length": 771.7980346679688, - "completions/min_length": 74.0, - "completions/min_terminated_length": 74.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1070.552734375, + "completions/mean_terminated_length": 1018.2612915039062, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.894768285397286, - "grad_norm": 2.9884707927703857, - "kl": 5.0703125, - "learning_rate": 1.3014871859598092e-07, - "loss": 0.3511, - "num_tokens": 1399656250.0, - "reward": 1.81640625, - "reward_std": 0.5273551940917969, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.19821202754974365, + "grad_norm": 5.192171096801758, + "kl": 2.466796875, + "learning_rate": 1.3017133837527082e-07, + "loss": 0.1407, + "num_tokens": 1502207925.0, + "reward": 1.05322265625, + "reward_std": 0.3559301197528839, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.91845703125, + "rewards/tag_count_reward/std": 0.19488608837127686, "step": 2621 }, { @@ -76024,27 +76024,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 817.431640625, - "completions/mean_terminated_length": 780.291748046875, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1084.482421875, + "completions/mean_terminated_length": 1032.9361572265625, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, "epoch": 0.8951096697106768, - "grad_norm": 3.455577850341797, - "kl": 4.4453125, - "learning_rate": 1.299560879647101e-07, - "loss": 0.3186, - "num_tokens": 1400158871.0, - "reward": 1.9052734375, - "reward_std": 0.48785555362701416, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.892578125, - "rewards/format_reward/std": 0.30995169281959534, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.17928588390350342, + "grad_norm": 5.673861503601074, + "kl": 2.029296875, + "learning_rate": 1.299785648860936e-07, + "loss": 0.118, + "num_tokens": 1502847276.0, + "reward": 1.0810546875, + "reward_std": 0.36898016929626465, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.080078125, + "rewards/format_reward/std": 0.271679550409317, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.1962425857782364, "step": 2622 }, { @@ -76053,27 +76053,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 891.88671875, - "completions/mean_terminated_length": 852.181884765625, - "completions/min_length": 123.0, - "completions/min_terminated_length": 123.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1175.01953125, + "completions/mean_terminated_length": 1135.824462890625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, "epoch": 0.8954510540240675, - "grad_norm": 2.8744022846221924, - "kl": 4.96484375, - "learning_rate": 1.2976405350894536e-07, - "loss": 0.2987, - "num_tokens": 1400691645.0, - "reward": 1.83837890625, - "reward_std": 0.5029296278953552, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.18997004628181458, + "grad_norm": 2.5258841514587402, + "kl": 2.1328125, + "learning_rate": 1.2978638799291557e-07, + "loss": 0.0776, + "num_tokens": 1503525014.0, + "reward": 1.10498046875, + "reward_std": 0.42201095819473267, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.09765625, + "rewards/format_reward/std": 0.29713961482048035, + "rewards/tag_count_reward/mean": 0.91357421875, + "rewards/tag_count_reward/std": 0.19777707755565643, "step": 2623 }, { @@ -76082,27 +76082,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 862.896484375, - "completions/mean_terminated_length": 809.687744140625, - "completions/min_length": 75.0, - "completions/min_terminated_length": 75.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1058.30859375, + "completions/mean_terminated_length": 1001.0536499023438, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, "epoch": 0.8957924383374584, - "grad_norm": 1.500162959098816, - "kl": 6.421875, - "learning_rate": 1.295726155012445e-07, - "loss": 0.4093, - "num_tokens": 1401207592.0, - "reward": 1.81103515625, - "reward_std": 0.5551319122314453, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.20604471862316132, + "grad_norm": 2.5906670093536377, + "kl": 2.328125, + "learning_rate": 1.2959480796870362e-07, + "loss": 0.1335, + "num_tokens": 1504141012.0, + "reward": 1.078125, + "reward_std": 0.3664228320121765, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.08203125, + "rewards/format_reward/std": 0.2746807038784027, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.19119404256343842, "step": 2624 }, { @@ -76111,27 +76111,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1983.0, - "completions/mean_length": 809.501953125, - "completions/mean_terminated_length": 784.8306884765625, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1042.99609375, + "completions/mean_terminated_length": 964.7115478515625, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, "epoch": 0.8961338226508492, - "grad_norm": 1.9419004917144775, - "kl": 4.03515625, - "learning_rate": 1.2938177421331875e-07, - "loss": 0.2319, - "num_tokens": 1401699017.0, - "reward": 1.904296875, - "reward_std": 0.4911288022994995, - "rewards/accuracy_reward/mean": 0.07459677755832672, - "rewards/accuracy_reward/std": 0.263004869222641, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.17051447927951813, + "grad_norm": 6.531774044036865, + "kl": 2.51953125, + "learning_rate": 1.2940382508557692e-07, + "loss": 0.1831, + "num_tokens": 1504751986.0, + "reward": 1.140625, + "reward_std": 0.41861116886138916, + "rewards/accuracy_reward/mean": 0.15120968222618103, + "rewards/accuracy_reward/std": 0.35861483216285706, + "rewards/format_reward/mean": 0.080078125, + "rewards/format_reward/std": 0.271679550409317, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.20793341100215912, "step": 2625 }, { @@ -76140,27 +76140,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1999.0, - "completions/mean_length": 895.08203125, - "completions/mean_terminated_length": 835.8973388671875, - "completions/min_length": 101.0, - "completions/min_terminated_length": 101.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1118.3515625, + "completions/mean_terminated_length": 1037.4267578125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, "epoch": 0.89647520696424, - "grad_norm": 0.9074051976203918, - "kl": 7.3515625, - "learning_rate": 1.2919152991603235e-07, - "loss": 0.4617, - "num_tokens": 1402241091.0, - "reward": 1.83154296875, - "reward_std": 0.5970633625984192, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.203689306974411, + "grad_norm": 1.7829670906066895, + "kl": 3.08203125, + "learning_rate": 1.292134396148065e-07, + "loss": 0.1634, + "num_tokens": 1505408374.0, + "reward": 1.11328125, + "reward_std": 0.4727310538291931, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, + "rewards/format_reward/mean": 0.083984375, + "rewards/format_reward/std": 0.2776356339454651, + "rewards/tag_count_reward/mean": 0.88671875, + "rewards/tag_count_reward/std": 0.22742362320423126, "step": 2626 }, { @@ -76169,27 +76169,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.083984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 853.10546875, - "completions/mean_terminated_length": 794.3401489257812, - "completions/min_length": 106.0, - "completions/min_terminated_length": 106.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1078.279296875, + "completions/mean_terminated_length": 989.3710327148438, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, "epoch": 0.8968165912776308, - "grad_norm": 2.1534411907196045, - "kl": 7.0859375, - "learning_rate": 1.2900188287940223e-07, - "loss": 0.4383, - "num_tokens": 1402752745.0, - "reward": 1.85888671875, - "reward_std": 0.5010073781013489, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.19918283820152283, + "grad_norm": 6.372350215911865, + "kl": 3.36328125, + "learning_rate": 1.2902365182681476e-07, + "loss": 0.178, + "num_tokens": 1506035317.0, + "reward": 1.06201171875, + "reward_std": 0.3921143114566803, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.89794921875, + "rewards/tag_count_reward/std": 0.21943418681621552, "step": 2627 }, { @@ -76198,27 +76198,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1980.0, - "completions/mean_length": 884.89453125, - "completions/mean_terminated_length": 807.3541870117188, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1055.701171875, + "completions/mean_terminated_length": 1006.8995361328125, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, "epoch": 0.8971579755910216, - "grad_norm": 2.211641311645508, - "kl": 8.9375, - "learning_rate": 1.2881283337259784e-07, - "loss": 0.551, - "num_tokens": 1403279379.0, - "reward": 1.81591796875, - "reward_std": 0.5615724921226501, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.90966796875, - "rewards/tag_count_reward/std": 0.2201211303472519, + "grad_norm": 5.5959367752075195, + "kl": 2.8515625, + "learning_rate": 1.2883446199117506e-07, + "loss": 0.1534, + "num_tokens": 1506649404.0, + "reward": 1.0791015625, + "reward_std": 0.377570241689682, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.9130859375, + "rewards/tag_count_reward/std": 0.20215243101119995, "step": 2628 }, { @@ -76227,27 +76227,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1759.0, - "completions/mean_length": 865.732421875, - "completions/mean_terminated_length": 792.1473388671875, - "completions/min_length": 84.0, - "completions/min_terminated_length": 84.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1062.7734375, + "completions/mean_terminated_length": 994.897705078125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, "epoch": 0.8974993599044124, - "grad_norm": 2.0325682163238525, - "kl": 7.65625, - "learning_rate": 1.2862438166394022e-07, - "loss": 0.4941, - "num_tokens": 1403793706.0, - "reward": 1.85595703125, - "reward_std": 0.5213677883148193, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.19921162724494934, + "grad_norm": 3.3806042671203613, + "kl": 2.58984375, + "learning_rate": 1.286458703766117e-07, + "loss": 0.1619, + "num_tokens": 1507264616.0, + "reward": 1.04833984375, + "reward_std": 0.355716347694397, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.06640625, + "rewards/format_reward/std": 0.2492343932390213, + "rewards/tag_count_reward/mean": 0.90771484375, + "rewards/tag_count_reward/std": 0.2048913985490799, "step": 2629 }, { @@ -76256,27 +76256,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 832.08984375, - "completions/mean_terminated_length": 780.0855712890625, - "completions/min_length": 168.0, - "completions/min_terminated_length": 168.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 1039.947265625, + "completions/mean_terminated_length": 970.4989624023438, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.8978407442178032, - "grad_norm": 1.362596869468689, - "kl": 7.0, - "learning_rate": 1.284365280209022e-07, - "loss": 0.451, - "num_tokens": 1404303496.0, - "reward": 1.85205078125, - "reward_std": 0.530191957950592, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.20153403282165527, + "grad_norm": 2.282097578048706, + "kl": 3.45703125, + "learning_rate": 1.2845787725099897e-07, + "loss": 0.2178, + "num_tokens": 1507880829.0, + "reward": 1.07373046875, + "reward_std": 0.3778243064880371, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.072265625, + "rewards/format_reward/std": 0.2591804563999176, + "rewards/tag_count_reward/mean": 0.90185546875, + "rewards/tag_count_reward/std": 0.21617895364761353, "step": 2630 }, { @@ -76285,27 +76285,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 888.017578125, - "completions/mean_terminated_length": 830.96923828125, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1110.724609375, + "completions/mean_terminated_length": 1060.582275390625, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, "epoch": 0.898182128531194, - "grad_norm": 1.9809017181396484, - "kl": 8.359375, - "learning_rate": 1.2824927271010777e-07, - "loss": 0.5198, - "num_tokens": 1404833553.0, - "reward": 1.8310546875, - "reward_std": 0.5532511472702026, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.19929614663124084, + "grad_norm": 2.463353395462036, + "kl": 2.1796875, + "learning_rate": 1.2827048288136126e-07, + "loss": 0.1131, + "num_tokens": 1508524912.0, + "reward": 1.14111328125, + "reward_std": 0.4089772403240204, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.095703125, + "rewards/format_reward/std": 0.2944713830947876, + "rewards/tag_count_reward/mean": 0.93017578125, + "rewards/tag_count_reward/std": 0.1820801943540573, "step": 2631 }, { @@ -76316,25 +76316,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1741.0, - "completions/mean_length": 855.234375, - "completions/mean_terminated_length": 814.270751953125, - "completions/min_length": 80.0, - "completions/min_terminated_length": 80.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1043.884765625, + "completions/mean_terminated_length": 1009.4000244140625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, "epoch": 0.8985235128445848, - "grad_norm": 2.1450774669647217, - "kl": 4.640625, - "learning_rate": 1.2806261599733127e-07, - "loss": 0.2875, - "num_tokens": 1405350697.0, - "reward": 1.876953125, - "reward_std": 0.49398133158683777, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.17051447927951813, + "grad_norm": 4.780738353729248, + "kl": 2.07421875, + "learning_rate": 1.2808368753387248e-07, + "loss": 0.0874, + "num_tokens": 1509138645.0, + "reward": 1.05615234375, + "reward_std": 0.3105568587779999, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.93896484375, + "rewards/tag_count_reward/std": 0.1664358228445053, "step": 2632 }, { @@ -76343,27 +76343,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1926.0, - "completions/mean_length": 832.533203125, - "completions/mean_terminated_length": 780.5479125976562, - "completions/min_length": 22.0, - "completions/min_terminated_length": 22.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1094.533203125, + "completions/mean_terminated_length": 1041.45361328125, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, "epoch": 0.8988648971579756, - "grad_norm": 1.1332107782363892, - "kl": 6.3203125, - "learning_rate": 1.278765581474981e-07, - "loss": 0.3981, - "num_tokens": 1405855274.0, - "reward": 1.81982421875, - "reward_std": 0.5369333028793335, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.841796875, - "rewards/format_reward/std": 0.36528825759887695, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.20085012912750244, + "grad_norm": 5.698365688323975, + "kl": 2.8203125, + "learning_rate": 1.2789749147385562e-07, + "loss": 0.1062, + "num_tokens": 1509777366.0, + "reward": 1.11328125, + "reward_std": 0.41584059596061707, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29176566004753113, + "rewards/tag_count_reward/mean": 0.900390625, + "rewards/tag_count_reward/std": 0.20652242004871368, "step": 2633 }, { @@ -76372,27 +76372,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 847.32421875, - "completions/mean_terminated_length": 788.2745361328125, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1027.255859375, + "completions/mean_terminated_length": 985.7621459960938, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.8992062814713664, - "grad_norm": 1.9365277290344238, - "kl": 7.2734375, - "learning_rate": 1.276910994246831e-07, - "loss": 0.4488, - "num_tokens": 1406364304.0, - "reward": 1.8359375, - "reward_std": 0.5888235569000244, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.2085207849740982, + "grad_norm": 2.0969462394714355, + "kl": 2.55859375, + "learning_rate": 1.2771189496578248e-07, + "loss": 0.113, + "num_tokens": 1510378521.0, + "reward": 1.1416015625, + "reward_std": 0.46402841806411743, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.099609375, + "rewards/format_reward/std": 0.29977133870124817, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.18861530721187592, "step": 2634 }, { @@ -76401,27 +76401,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1982.0, - "completions/mean_length": 823.33984375, - "completions/mean_terminated_length": 760.4722900390625, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1066.6328125, + "completions/mean_terminated_length": 999.0230102539062, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.8995476657847572, - "grad_norm": 1.1484029293060303, - "kl": 8.203125, - "learning_rate": 1.275062400921112e-07, - "loss": 0.5561, - "num_tokens": 1406859966.0, - "reward": 1.83642578125, - "reward_std": 0.5626037120819092, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.91064453125, - "rewards/tag_count_reward/std": 0.2166028767824173, + "grad_norm": 5.536674976348877, + "kl": 3.5859375, + "learning_rate": 1.275268982732733e-07, + "loss": 0.1671, + "num_tokens": 1510998749.0, + "reward": 1.095703125, + "reward_std": 0.42561477422714233, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.095703125, + "rewards/format_reward/std": 0.2944713830947876, + "rewards/tag_count_reward/mean": 0.884765625, + "rewards/tag_count_reward/std": 0.22535543143749237, "step": 2635 }, { @@ -76430,27 +76430,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 911.73828125, - "completions/mean_terminated_length": 853.4086303710938, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1108.541015625, + "completions/mean_terminated_length": 1041.717529296875, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, "epoch": 0.899889050098148, - "grad_norm": 1.732434868812561, - "kl": 8.2890625, - "learning_rate": 1.273219804121562e-07, - "loss": 0.5103, - "num_tokens": 1407409512.0, - "reward": 1.74755859375, - "reward_std": 0.6244097948074341, - "rewards/accuracy_reward/mean": 0.052419353276491165, - "rewards/accuracy_reward/std": 0.22309619188308716, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.89404296875, - "rewards/tag_count_reward/std": 0.23642945289611816, + "grad_norm": 3.360011100769043, + "kl": 2.890625, + "learning_rate": 1.2734250165909624e-07, + "loss": 0.1511, + "num_tokens": 1511649058.0, + "reward": 1.103515625, + "reward_std": 0.4231613874435425, + "rewards/accuracy_reward/mean": 0.09677419066429138, + "rewards/accuracy_reward/std": 0.2959485352039337, + "rewards/format_reward/mean": 0.1015625, + "rewards/format_reward/std": 0.30236753821372986, + "rewards/tag_count_reward/mean": 0.908203125, + "rewards/tag_count_reward/std": 0.20895110070705414, "step": 2636 }, { @@ -76459,27 +76459,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 801.1015625, - "completions/mean_terminated_length": 760.8790283203125, - "completions/min_length": 91.0, - "completions/min_terminated_length": 91.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1032.490234375, + "completions/mean_terminated_length": 991.2092895507812, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, "epoch": 0.9002304344115388, - "grad_norm": 1.1615513563156128, - "kl": 6.4296875, - "learning_rate": 1.2713832064634125e-07, - "loss": 0.4347, - "num_tokens": 1407901244.0, - "reward": 1.85888671875, - "reward_std": 0.5226007103919983, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.19561529159545898, + "grad_norm": 5.832035064697266, + "kl": 2.755859375, + "learning_rate": 1.2715870538516713e-07, + "loss": 0.0993, + "num_tokens": 1512259261.0, + "reward": 1.12451171875, + "reward_std": 0.4120340943336487, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.09765625, + "rewards/format_reward/std": 0.29713961482048035, + "rewards/tag_count_reward/mean": 0.91943359375, + "rewards/tag_count_reward/std": 0.19085827469825745, "step": 2637 }, { @@ -76488,27 +76488,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1806.0, - "completions/mean_length": 892.8203125, - "completions/mean_terminated_length": 823.4617309570312, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1086.95703125, + "completions/mean_terminated_length": 1043.80810546875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.9005718187249296, - "grad_norm": 2.583146810531616, - "kl": 8.6328125, - "learning_rate": 1.2695526105533768e-07, - "loss": 0.5558, - "num_tokens": 1408437136.0, - "reward": 1.748046875, - "reward_std": 0.6300433874130249, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.892578125, - "rewards/tag_count_reward/std": 0.230254665017128, + "grad_norm": 3.8658218383789062, + "kl": 2.185546875, + "learning_rate": 1.269755097125492e-07, + "loss": 0.0796, + "num_tokens": 1512894551.0, + "reward": 1.10986328125, + "reward_std": 0.4367341995239258, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.1015625, + "rewards/format_reward/std": 0.30236753821372986, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.18640044331550598, "step": 2638 }, { @@ -76517,27 +76517,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1980.0, - "completions/mean_length": 857.107421875, - "completions/mean_terminated_length": 793.3970947265625, - "completions/min_length": 3.0, - "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1143.248046875, + "completions/mean_terminated_length": 1080.91650390625, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, "epoch": 0.9009132030383203, - "grad_norm": 2.558863878250122, - "kl": 7.4765625, - "learning_rate": 1.26772801898965e-07, - "loss": 0.4564, - "num_tokens": 1408957511.0, - "reward": 1.79052734375, - "reward_std": 0.602996826171875, - "rewards/accuracy_reward/mean": 0.08669354766607285, - "rewards/accuracy_reward/std": 0.281669557094574, - "rewards/format_reward/mean": 0.80859375, - "rewards/format_reward/std": 0.3937928080558777, - "rewards/tag_count_reward/mean": 0.89794921875, - "rewards/tag_count_reward/std": 0.2249389886856079, + "grad_norm": 1.6574437618255615, + "kl": 3.41796875, + "learning_rate": 1.2679291490145267e-07, + "loss": 0.1824, + "num_tokens": 1513561430.0, + "reward": 1.134765625, + "reward_std": 0.44155943393707275, + "rewards/accuracy_reward/mean": 0.14717741310596466, + "rewards/accuracy_reward/std": 0.354640394449234, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29176566004753113, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.21770349144935608, "step": 2639 }, { @@ -76546,27 +76546,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 822.44921875, - "completions/mean_terminated_length": 775.217041015625, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1087.01953125, + "completions/mean_terminated_length": 1031.425537109375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.9012545873517112, - "grad_norm": 2.6348209381103516, - "kl": 5.765625, - "learning_rate": 1.2659094343619087e-07, - "loss": 0.3555, - "num_tokens": 1409460893.0, - "reward": 1.85498046875, - "reward_std": 0.5454111695289612, - "rewards/accuracy_reward/mean": 0.07056451588869095, - "rewards/accuracy_reward/std": 0.25635457038879395, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.1875656098127365, + "grad_norm": 5.300570487976074, + "kl": 2.517578125, + "learning_rate": 1.2661092121123387e-07, + "loss": 0.1722, + "num_tokens": 1514200272.0, + "reward": 1.09716796875, + "reward_std": 0.4066402316093445, + "rewards/accuracy_reward/mean": 0.1270161271095276, + "rewards/accuracy_reward/std": 0.3333272337913513, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.90380859375, + "rewards/tag_count_reward/std": 0.20725619792938232, "step": 2640 }, { @@ -76575,27 +76575,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 764.1015625, - "completions/mean_terminated_length": 743.7222900390625, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1002.271484375, + "completions/mean_terminated_length": 959.7621459960938, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, "epoch": 0.901595971665102, - "grad_norm": 2.027493953704834, - "kl": 4.8671875, - "learning_rate": 1.2640968592512978e-07, - "loss": 0.2933, - "num_tokens": 1409927345.0, - "reward": 1.89453125, - "reward_std": 0.48001429438591003, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.9375, - "rewards/tag_count_reward/std": 0.17556172609329224, + "grad_norm": 2.8814470767974854, + "kl": 2.66015625, + "learning_rate": 1.2642952890039577e-07, + "loss": 0.1415, + "num_tokens": 1514788667.0, + "reward": 1.12548828125, + "reward_std": 0.3859245777130127, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.92236328125, + "rewards/tag_count_reward/std": 0.18493369221687317, "step": 2641 }, { @@ -76604,27 +76604,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 898.37109375, - "completions/mean_terminated_length": 841.8319091796875, - "completions/min_length": 125.0, - "completions/min_terminated_length": 125.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1075.18359375, + "completions/mean_terminated_length": 1029.4273681640625, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, "epoch": 0.9019373559784928, - "grad_norm": 2.3642499446868896, - "kl": 5.40625, - "learning_rate": 1.2622902962304394e-07, - "loss": 0.3667, - "num_tokens": 1410466095.0, - "reward": 1.8291015625, - "reward_std": 0.5632820725440979, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.20294499397277832, + "grad_norm": 2.086258888244629, + "kl": 1.462890625, + "learning_rate": 1.262487382265868e-07, + "loss": 0.0566, + "num_tokens": 1515417945.0, + "reward": 1.107421875, + "reward_std": 0.3736908435821533, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.15718287229537964, "step": 2642 }, { @@ -76633,27 +76633,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 772.806640625, - "completions/mean_terminated_length": 742.2020263671875, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1031.48828125, + "completions/mean_terminated_length": 968.219970703125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.9022787402918836, - "grad_norm": 2.121507406234741, - "kl": 3.2578125, - "learning_rate": 1.2604897478634172e-07, - "loss": 0.2356, - "num_tokens": 1410945996.0, - "reward": 1.92431640625, - "reward_std": 0.4335697591304779, - "rewards/accuracy_reward/mean": 0.07459677755832672, - "rewards/accuracy_reward/std": 0.263004869222641, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.94775390625, - "rewards/tag_count_reward/std": 0.16722622513771057, + "grad_norm": 4.667018890380859, + "kl": 2.376953125, + "learning_rate": 1.2606854944660113e-07, + "loss": 0.122, + "num_tokens": 1516030291.0, + "reward": 1.11328125, + "reward_std": 0.44415283203125, + "rewards/accuracy_reward/mean": 0.14516128599643707, + "rewards/accuracy_reward/std": 0.3526190221309662, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.904296875, + "rewards/tag_count_reward/std": 0.20777709782123566, "step": 2643 }, { @@ -76662,27 +76662,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 883.7265625, - "completions/mean_terminated_length": 826.4671630859375, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1126.814453125, + "completions/mean_terminated_length": 1073.522705078125, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, "epoch": 0.9026201246052744, - "grad_norm": 2.303095817565918, - "kl": 5.3984375, - "learning_rate": 1.2586952167057805e-07, - "loss": 0.3467, - "num_tokens": 1411479440.0, - "reward": 1.833984375, - "reward_std": 0.5213505029678345, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.931640625, - "rewards/tag_count_reward/std": 0.19337067008018494, + "grad_norm": 5.649810791015625, + "kl": 2.70703125, + "learning_rate": 1.2588896281637765e-07, + "loss": 0.1474, + "num_tokens": 1516688196.0, + "reward": 1.07763671875, + "reward_std": 0.42246103286743164, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.12109375, + "rewards/format_reward/std": 0.3265552520751953, + "rewards/tag_count_reward/mean": 0.90966796875, + "rewards/tag_count_reward/std": 0.19911566376686096, "step": 2644 }, { @@ -76691,27 +76691,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 750.623046875, - "completions/mean_terminated_length": 700.6226806640625, - "completions/min_length": 76.0, - "completions/min_terminated_length": 76.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 968.8359375, + "completions/mean_terminated_length": 929.5142211914062, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, "epoch": 0.9029615089186652, - "grad_norm": 4.877878665924072, - "kl": 5.08984375, - "learning_rate": 1.256906705304539e-07, - "loss": 0.3845, - "num_tokens": 1411937887.0, - "reward": 1.89990234375, - "reward_std": 0.4928579032421112, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.93701171875, - "rewards/tag_count_reward/std": 0.17986257374286652, + "grad_norm": 4.944515228271484, + "kl": 2.92578125, + "learning_rate": 1.2570997859100044e-07, + "loss": 0.1965, + "num_tokens": 1517258368.0, + "reward": 1.1064453125, + "reward_std": 0.3886204957962036, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.068359375, + "rewards/format_reward/std": 0.25260838866233826, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.18374989926815033, "step": 2645 }, { @@ -76720,27 +76720,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2031.0, - "completions/mean_length": 807.642578125, - "completions/mean_terminated_length": 767.6310424804688, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1052.685546875, + "completions/mean_terminated_length": 1012.2255859375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, "epoch": 0.903302893232056, - "grad_norm": 2.0590627193450928, - "kl": 4.33984375, - "learning_rate": 1.2551242161981563e-07, - "loss": 0.2852, - "num_tokens": 1412434776.0, - "reward": 1.8896484375, - "reward_std": 0.4454382061958313, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.892578125, - "rewards/format_reward/std": 0.30995169281959534, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.17514485120773315, + "grad_norm": 1.7116081714630127, + "kl": 1.90625, + "learning_rate": 1.2553159702469743e-07, + "loss": 0.0879, + "num_tokens": 1517880719.0, + "reward": 1.0712890625, + "reward_std": 0.3728310167789459, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.07421875, + "rewards/format_reward/std": 0.2623828947544098, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.1848077028989792, "step": 2646 }, { @@ -76749,27 +76749,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 806.576171875, - "completions/mean_terminated_length": 766.5302124023438, - "completions/min_length": 91.0, - "completions/min_terminated_length": 91.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1052.8671875, + "completions/mean_terminated_length": 1014.5151977539062, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.9036442775454467, - "grad_norm": 3.0107901096343994, - "kl": 5.796875, - "learning_rate": 1.253347751916551e-07, - "loss": 0.407, - "num_tokens": 1412924255.0, - "reward": 1.87255859375, - "reward_std": 0.5324903130531311, - "rewards/accuracy_reward/mean": 0.07056451588869095, - "rewards/accuracy_reward/std": 0.25635457038879395, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.19710436463356018, + "grad_norm": 1.2536317110061646, + "kl": 2.392578125, + "learning_rate": 1.2535381837084092e-07, + "loss": 0.0877, + "num_tokens": 1518496299.0, + "reward": 1.1474609375, + "reward_std": 0.4637940526008606, + "rewards/accuracy_reward/mean": 0.1270161271095276, + "rewards/accuracy_reward/std": 0.33332720398902893, + "rewards/format_reward/mean": 0.103515625, + "rewards/format_reward/std": 0.30492907762527466, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.1879250556230545, "step": 2647 }, { @@ -76778,27 +76778,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, - "completions/mean_length": 796.70703125, - "completions/mean_terminated_length": 764.1082153320312, - "completions/min_length": 66.0, - "completions/min_terminated_length": 66.0, + "completions/mean_length": 1041.8671875, + "completions/mean_terminated_length": 1011.5009765625, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, "epoch": 0.9039856618588376, - "grad_norm": 1.3113056421279907, - "kl": 4.96875, - "learning_rate": 1.2515773149810875e-07, - "loss": 0.3184, - "num_tokens": 1413414345.0, - "reward": 1.91650390625, - "reward_std": 0.3993160128593445, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.95751953125, - "rewards/tag_count_reward/std": 0.15172556042671204, + "grad_norm": 4.721194744110107, + "kl": 1.619140625, + "learning_rate": 1.251766428819465e-07, + "loss": 0.0775, + "num_tokens": 1519111911.0, + "reward": 1.13916015625, + "reward_std": 0.40613314509391785, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.099609375, + "rewards/format_reward/std": 0.29977133870124817, + "rewards/tag_count_reward/mean": 0.94189453125, + "rewards/tag_count_reward/std": 0.1615353375673294, "step": 2648 }, { @@ -76807,27 +76807,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1917.0, - "completions/mean_length": 811.69140625, - "completions/mean_terminated_length": 774.3782348632812, - "completions/min_length": 52.0, - "completions/min_terminated_length": 52.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1093.201171875, + "completions/mean_terminated_length": 1037.96484375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, "epoch": 0.9043270461722284, - "grad_norm": 1.8289512395858765, - "kl": 5.078125, - "learning_rate": 1.24981290790458e-07, - "loss": 0.3301, - "num_tokens": 1413912075.0, - "reward": 1.88134765625, - "reward_std": 0.45731788873672485, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.89453125, - "rewards/format_reward/std": 0.3074568510055542, - "rewards/tag_count_reward/mean": 0.95166015625, - "rewards/tag_count_reward/std": 0.15477456152439117, + "grad_norm": 2.5431466102600098, + "kl": 2.1025390625, + "learning_rate": 1.2500007080967335e-07, + "loss": 0.0873, + "num_tokens": 1519753774.0, + "reward": 1.05712890625, + "reward_std": 0.38261714577674866, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.08984375, + "rewards/format_reward/std": 0.2862374484539032, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.19531220197677612, "step": 2649 }, { @@ -76836,27 +76836,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1893.0, - "completions/mean_length": 771.2734375, - "completions/mean_terminated_length": 735.3814697265625, - "completions/min_length": 124.0, - "completions/min_terminated_length": 124.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 987.685546875, + "completions/mean_terminated_length": 977.2288208007812, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.9046684304856192, - "grad_norm": 1.23770272731781, - "kl": 4.83984375, - "learning_rate": 1.2480545331912786e-07, - "loss": 0.3244, - "num_tokens": 1414379975.0, - "reward": 1.92578125, - "reward_std": 0.3980555534362793, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.953125, - "rewards/tag_count_reward/std": 0.16102655231952667, + "grad_norm": 8.41772747039795, + "kl": 1.630859375, + "learning_rate": 1.2482410240482323e-07, + "loss": 0.0244, + "num_tokens": 1520332477.0, + "reward": 1.17626953125, + "reward_std": 0.4130568504333496, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.099609375, + "rewards/format_reward/std": 0.29977133870124817, + "rewards/tag_count_reward/mean": 0.93603515625, + "rewards/tag_count_reward/std": 0.16680285334587097, "step": 2650 }, { @@ -76865,27 +76865,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1918.0, - "completions/mean_length": 755.50390625, - "completions/mean_terminated_length": 708.408935546875, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1002.05078125, + "completions/mean_terminated_length": 943.8226928710938, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.90500981479901, - "grad_norm": 1.767143964767456, - "kl": 6.984375, - "learning_rate": 1.246302193336876e-07, - "loss": 0.4542, - "num_tokens": 1414838185.0, - "reward": 1.9462890625, - "reward_std": 0.48878249526023865, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.892578125, - "rewards/format_reward/std": 0.30995169281959534, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.17653599381446838, + "grad_norm": 2.3477649688720703, + "kl": 2.3359375, + "learning_rate": 1.2464873791734088e-07, + "loss": 0.1176, + "num_tokens": 1520916919.0, + "reward": 1.1591796875, + "reward_std": 0.3943057656288147, + "rewards/accuracy_reward/mean": 0.177734375, + "rewards/accuracy_reward/std": 0.3826628625392914, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.9111328125, + "rewards/tag_count_reward/std": 0.19885456562042236, "step": 2651 }, { @@ -76894,27 +76894,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1946.0, - "completions/mean_length": 779.169921875, - "completions/mean_terminated_length": 727.5914306640625, - "completions/min_length": 99.0, - "completions/min_terminated_length": 99.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1065.58984375, + "completions/mean_terminated_length": 986.8311767578125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.9053511991124008, - "grad_norm": 2.6416122913360596, - "kl": 7.8125, - "learning_rate": 1.2445558908284983e-07, - "loss": 0.454, - "num_tokens": 1415314688.0, - "reward": 1.86669921875, - "reward_std": 0.5072627663612366, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.1922997236251831, + "grad_norm": 2.826422929763794, + "kl": 3.2109375, + "learning_rate": 1.244739775963128e-07, + "loss": 0.1928, + "num_tokens": 1521540069.0, + "reward": 1.10400390625, + "reward_std": 0.40402260422706604, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.10546875, + "rewards/format_reward/std": 0.3074568510055542, + "rewards/tag_count_reward/mean": 0.90478515625, + "rewards/tag_count_reward/std": 0.21408694982528687, "step": 2652 }, { @@ -76923,27 +76923,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1875.0, - "completions/mean_length": 799.181640625, - "completions/mean_terminated_length": 753.6781616210938, - "completions/min_length": 93.0, - "completions/min_terminated_length": 93.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1101.953125, + "completions/mean_terminated_length": 1069.462646484375, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, "epoch": 0.9056925834257916, - "grad_norm": 3.158607006072998, - "kl": 8.171875, - "learning_rate": 1.2428156281447017e-07, - "loss": 0.4813, - "num_tokens": 1415800573.0, - "reward": 1.8623046875, - "reward_std": 0.4951796531677246, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.18930304050445557, + "grad_norm": 4.190649032592773, + "kl": 1.921875, + "learning_rate": 1.242998216899677e-07, + "loss": 0.0674, + "num_tokens": 1522180973.0, + "reward": 1.103515625, + "reward_std": 0.41256994009017944, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.31241437792778015, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.1787441074848175, "step": 2653 }, { @@ -76952,27 +76952,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 766.255859375, - "completions/mean_terminated_length": 730.2228393554688, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1007.587890625, + "completions/mean_terminated_length": 967.4908447265625, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.9060339677391824, - "grad_norm": 2.059149980545044, - "kl": 7.5859375, - "learning_rate": 1.2410814077554717e-07, - "loss": 0.4717, - "num_tokens": 1416260048.0, - "reward": 1.9111328125, - "reward_std": 0.5058963894844055, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.17932851612567902, + "grad_norm": 1.3401790857315063, + "kl": 2.173828125, + "learning_rate": 1.2412627044567542e-07, + "loss": 0.1253, + "num_tokens": 1522764010.0, + "reward": 1.1474609375, + "reward_std": 0.41852399706840515, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.19115155935287476, "step": 2654 }, { @@ -76981,27 +76981,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.1015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 851.5234375, - "completions/mean_terminated_length": 779.685302734375, - "completions/min_length": 81.0, - "completions/min_terminated_length": 81.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1138.1640625, + "completions/mean_terminated_length": 1035.31298828125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.9063753520525731, - "grad_norm": 4.332850933074951, - "kl": 9.296875, - "learning_rate": 1.239353232122216e-07, - "loss": 0.5657, - "num_tokens": 1416776604.0, - "reward": 1.81494140625, - "reward_std": 0.5418930053710938, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.91455078125, - "rewards/tag_count_reward/std": 0.21018162369728088, + "grad_norm": 4.066878795623779, + "kl": 2.392578125, + "learning_rate": 1.2395332410994732e-07, + "loss": 0.1628, + "num_tokens": 1523427326.0, + "reward": 1.041015625, + "reward_std": 0.40300118923187256, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.888671875, + "rewards/tag_count_reward/std": 0.22838793694972992, "step": 2655 }, { @@ -77010,27 +77010,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1871.0, - "completions/mean_length": 754.818359375, - "completions/mean_terminated_length": 718.4638061523438, - "completions/min_length": 92.0, - "completions/min_terminated_length": 92.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1004.39453125, + "completions/mean_terminated_length": 948.5637817382812, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, "epoch": 0.906716736365964, - "grad_norm": 1.4998120069503784, - "kl": 5.84765625, - "learning_rate": 1.2376311036977652e-07, - "loss": 0.3438, - "num_tokens": 1417230927.0, - "reward": 1.97900390625, - "reward_std": 0.462978720664978, - "rewards/accuracy_reward/mean": 0.119140625, - "rewards/accuracy_reward/std": 0.32427072525024414, - "rewards/format_reward/mean": 0.908203125, - "rewards/format_reward/std": 0.289021372795105, - "rewards/tag_count_reward/mean": 0.95166015625, - "rewards/tag_count_reward/std": 0.15944552421569824, + "grad_norm": 3.056931495666504, + "kl": 2.033203125, + "learning_rate": 1.237809829284352e-07, + "loss": 0.1233, + "num_tokens": 1524009432.0, + "reward": 1.1435546875, + "reward_std": 0.36671286821365356, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2422981858253479, + "rewards/tag_count_reward/mean": 0.9208984375, + "rewards/tag_count_reward/std": 0.18464216589927673, "step": 2656 }, { @@ -77039,27 +77039,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 797.23828125, - "completions/mean_terminated_length": 756.89111328125, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1082.41796875, + "completions/mean_terminated_length": 1047.23486328125, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, "epoch": 0.9070581206793548, - "grad_norm": 1.8390028476715088, - "kl": 7.2734375, - "learning_rate": 1.2359150249263649e-07, - "loss": 0.4555, - "num_tokens": 1417718345.0, - "reward": 1.8720703125, - "reward_std": 0.4648139178752899, - "rewards/accuracy_reward/mean": 0.0463709682226181, - "rewards/accuracy_reward/std": 0.21049949526786804, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.18068745732307434, + "grad_norm": 3.420994520187378, + "kl": 1.841796875, + "learning_rate": 1.2360924714593165e-07, + "loss": 0.0808, + "num_tokens": 1524642862.0, + "reward": 1.146484375, + "reward_std": 0.4280458688735962, + "rewards/accuracy_reward/mean": 0.1270161271095276, + "rewards/accuracy_reward/std": 0.33332720398902893, + "rewards/format_reward/mean": 0.1015625, + "rewards/format_reward/std": 0.30236753821372986, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.1830647885799408, "step": 2657 }, { @@ -77068,27 +77068,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.09765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1830.0, - "completions/mean_length": 828.07421875, - "completions/mean_terminated_length": 775.898193359375, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1168.869140625, + "completions/mean_terminated_length": 1073.72509765625, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, "epoch": 0.9073995049927456, - "grad_norm": 0.9786667227745056, - "kl": 6.890625, - "learning_rate": 1.2342049982436734e-07, - "loss": 0.4399, - "num_tokens": 1418218799.0, - "reward": 1.9091796875, - "reward_std": 0.5116897821426392, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.18272781372070312, + "grad_norm": 3.849257230758667, + "kl": 3.12890625, + "learning_rate": 1.2343811700636902e-07, + "loss": 0.1874, + "num_tokens": 1525317803.0, + "reward": 1.095703125, + "reward_std": 0.408250629901886, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.888671875, + "rewards/tag_count_reward/std": 0.21343934535980225, "step": 2658 }, { @@ -77097,27 +77097,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 805.41796875, - "completions/mean_terminated_length": 780.6653442382812, - "completions/min_length": 18.0, - "completions/min_terminated_length": 18.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1083.3203125, + "completions/mean_terminated_length": 1046.1419677734375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, "epoch": 0.9077408893061364, - "grad_norm": 1.0915343761444092, - "kl": 4.51953125, - "learning_rate": 1.2325010260767639e-07, - "loss": 0.268, - "num_tokens": 1418707653.0, - "reward": 1.94580078125, - "reward_std": 0.44406041502952576, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.90625, - "rewards/format_reward/std": 0.29176566004753113, - "rewards/tag_count_reward/mean": 0.95751953125, - "rewards/tag_count_reward/std": 0.1476399451494217, + "grad_norm": 2.9929521083831787, + "kl": 1.86328125, + "learning_rate": 1.2326759275281966e-07, + "loss": 0.1114, + "num_tokens": 1525948943.0, + "reward": 1.146484375, + "reward_std": 0.3887425661087036, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.26863065361976624, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.18505829572677612, "step": 2659 }, { @@ -77126,27 +77126,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 699.26953125, - "completions/mean_terminated_length": 669.6566772460938, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_terminated_length": 1934.0, + "completions/mean_length": 938.794921875, + "completions/mean_terminated_length": 907.6124267578125, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.9080822736195272, - "grad_norm": 1.5793782472610474, - "kl": 5.578125, - "learning_rate": 1.2308031108441105e-07, - "loss": 0.3933, - "num_tokens": 1419137279.0, - "reward": 1.92626953125, - "reward_std": 0.4270179867744446, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.9140625, - "rewards/format_reward/std": 0.28054583072662354, - "rewards/tag_count_reward/mean": 0.95556640625, - "rewards/tag_count_reward/std": 0.15750399231910706, + "grad_norm": 1.695146083831787, + "kl": 2.24609375, + "learning_rate": 1.2309767462749515e-07, + "loss": 0.1372, + "num_tokens": 1526501206.0, + "reward": 1.06884765625, + "reward_std": 0.32772374153137207, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.064453125, + "rewards/format_reward/std": 0.24579854309558868, + "rewards/tag_count_reward/mean": 0.93603515625, + "rewards/tag_count_reward/std": 0.17042970657348633, "step": 2660 }, { @@ -77155,27 +77155,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 735.955078125, - "completions/mean_terminated_length": 701.7735595703125, - "completions/min_length": 117.0, - "completions/min_terminated_length": 117.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1036.87109375, + "completions/mean_terminated_length": 987.1433715820312, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, "epoch": 0.908423657932918, - "grad_norm": 1.0845015048980713, - "kl": 5.390625, - "learning_rate": 1.2291112549555952e-07, - "loss": 0.3535, - "num_tokens": 1419586680.0, - "reward": 1.875, - "reward_std": 0.44786369800567627, - "rewards/accuracy_reward/mean": 0.029296875, - "rewards/accuracy_reward/std": 0.16880230605602264, - "rewards/format_reward/mean": 0.900390625, - "rewards/format_reward/std": 0.29977133870124817, - "rewards/tag_count_reward/mean": 0.9453125, - "rewards/tag_count_reward/std": 0.1808803677558899, + "grad_norm": 7.301959991455078, + "kl": 2.72265625, + "learning_rate": 1.2292836287174631e-07, + "loss": 0.1366, + "num_tokens": 1527104676.0, + "reward": 1.0634765625, + "reward_std": 0.3880910277366638, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.2029850035905838, + "rewards/format_reward/mean": 0.103515625, + "rewards/format_reward/std": 0.30492907762527466, + "rewards/tag_count_reward/mean": 0.9169921875, + "rewards/tag_count_reward/std": 0.19077126681804657, "step": 2661 }, { @@ -77184,27 +77184,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1903.0, - "completions/mean_length": 776.365234375, - "completions/mean_terminated_length": 727.3569946289062, - "completions/min_length": 21.0, - "completions/min_terminated_length": 21.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 1045.271484375, + "completions/mean_terminated_length": 993.7967529296875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.9087650422463088, - "grad_norm": 1.8664003610610962, - "kl": 6.6484375, - "learning_rate": 1.2274254608124973e-07, - "loss": 0.4734, - "num_tokens": 1420062931.0, - "reward": 1.93017578125, - "reward_std": 0.509323239326477, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.93798828125, - "rewards/tag_count_reward/std": 0.18289703130722046, + "grad_norm": 1.512845754623413, + "kl": 2.232421875, + "learning_rate": 1.2275965772606242e-07, + "loss": 0.1188, + "num_tokens": 1527718607.0, + "reward": 1.15673828125, + "reward_std": 0.42630091309547424, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.103515625, + "rewards/format_reward/std": 0.30492907762527466, + "rewards/tag_count_reward/mean": 0.91455078125, + "rewards/tag_count_reward/std": 0.18610802292823792, "step": 2662 }, { @@ -77213,27 +77213,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1828.0, - "completions/mean_length": 786.041015625, - "completions/mean_terminated_length": 742.7010498046875, - "completions/min_length": 70.0, - "completions/min_terminated_length": 70.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1136.9296875, + "completions/mean_terminated_length": 1070.07958984375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, "epoch": 0.9091064265596995, - "grad_norm": 2.83722186088562, - "kl": 6.0234375, - "learning_rate": 1.2257457308074925e-07, - "loss": 0.4302, - "num_tokens": 1420544600.0, - "reward": 1.890625, - "reward_std": 0.44958245754241943, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.90234375, - "rewards/format_reward/std": 0.29713961482048035, - "rewards/tag_count_reward/mean": 0.947265625, - "rewards/tag_count_reward/std": 0.16743822395801544, + "grad_norm": 2.511873483657837, + "kl": 2.28515625, + "learning_rate": 1.2259155943007143e-07, + "loss": 0.1329, + "num_tokens": 1528379931.0, + "reward": 1.07275390625, + "reward_std": 0.40728020668029785, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29176566004753113, + "rewards/tag_count_reward/mean": 0.90673828125, + "rewards/tag_count_reward/std": 0.1996045708656311, "step": 2663 }, { @@ -77242,27 +77242,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1844.0, - "completions/mean_length": 687.99609375, - "completions/mean_terminated_length": 652.5651245117188, - "completions/min_length": 62.0, - "completions/min_terminated_length": 62.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 920.001953125, + "completions/mean_terminated_length": 897.5319213867188, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, "epoch": 0.9094478108730903, - "grad_norm": 1.1657016277313232, - "kl": 6.20703125, - "learning_rate": 1.2240720673246515e-07, - "loss": 0.4059, - "num_tokens": 1420968918.0, - "reward": 1.947265625, - "reward_std": 0.5019833445549011, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.89453125, - "rewards/format_reward/std": 0.3074568510055542, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.17615941166877747, + "grad_norm": 2.4876606464385986, + "kl": 2.080078125, + "learning_rate": 1.2242406822253908e-07, + "loss": 0.1094, + "num_tokens": 1528923036.0, + "reward": 1.1728515625, + "reward_std": 0.4149667024612427, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, + "rewards/format_reward/mean": 0.080078125, + "rewards/format_reward/std": 0.271679550409317, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.1784525215625763, "step": 2664 }, { @@ -77271,27 +77271,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1950.0, - "completions/mean_length": 758.087890625, - "completions/mean_terminated_length": 724.4829711914062, - "completions/min_length": 87.0, - "completions/min_terminated_length": 87.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1119.171875, + "completions/mean_terminated_length": 1038.3184814453125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.9097891951864812, - "grad_norm": 0.8518578410148621, - "kl": 7.3046875, - "learning_rate": 1.2224044727394326e-07, - "loss": 0.4586, - "num_tokens": 1421440899.0, - "reward": 1.833984375, - "reward_std": 0.4974585175514221, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.18033012747764587, + "grad_norm": 3.452432632446289, + "kl": 3.208984375, + "learning_rate": 1.22257184341369e-07, + "loss": 0.1466, + "num_tokens": 1529579892.0, + "reward": 1.046875, + "reward_std": 0.4089784622192383, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.091796875, + "rewards/format_reward/std": 0.289021372795105, + "rewards/tag_count_reward/mean": 0.884765625, + "rewards/tag_count_reward/std": 0.22643831372261047, "step": 2665 }, { @@ -77303,24 +77303,24 @@ "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, - "completions/mean_length": 810.29296875, - "completions/mean_terminated_length": 762.59228515625, - "completions/min_length": 138.0, - "completions/min_terminated_length": 138.0, + "completions/mean_length": 1078.55078125, + "completions/mean_terminated_length": 1041.1885986328125, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, "epoch": 0.910130579499872, - "grad_norm": 1.1885873079299927, - "kl": 7.0, - "learning_rate": 1.2207429494186826e-07, - "loss": 0.4449, - "num_tokens": 1421930281.0, - "reward": 1.880859375, - "reward_std": 0.45438534021377563, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.9453125, - "rewards/tag_count_reward/std": 0.16899466514587402, + "grad_norm": 2.5023417472839355, + "kl": 2.568359375, + "learning_rate": 1.2209090802360198e-07, + "loss": 0.1428, + "num_tokens": 1530206622.0, + "reward": 1.07861328125, + "reward_std": 0.38993924856185913, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29176566004753113, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.18574312329292297, "step": 2666 }, { @@ -77329,27 +77329,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 769.58984375, - "completions/mean_terminated_length": 720.3204956054688, - "completions/min_length": 63.0, - "completions/min_terminated_length": 63.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1015.810546875, + "completions/mean_terminated_length": 960.5905151367188, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, "epoch": 0.9104719638132628, - "grad_norm": 2.0983266830444336, - "kl": 9.25, - "learning_rate": 1.219087499720628e-07, - "loss": 0.5734, - "num_tokens": 1422396791.0, - "reward": 1.86279296875, - "reward_std": 0.5810860395431519, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.20802931487560272, + "grad_norm": 1.8152896165847778, + "kl": 2.134765625, + "learning_rate": 1.21925239505416e-07, + "loss": 0.0948, + "num_tokens": 1530799197.0, + "reward": 1.1875, + "reward_std": 0.45548373460769653, + "rewards/accuracy_reward/mean": 0.162109375, + "rewards/accuracy_reward/std": 0.3689115643501282, + "rewards/format_reward/mean": 0.1015625, + "rewards/format_reward/std": 0.30236753821372986, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.18587233126163483, "step": 2667 }, { @@ -77358,27 +77358,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1937.0, - "completions/mean_length": 781.3515625, - "completions/mean_terminated_length": 737.8505249023438, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1110.693359375, + "completions/mean_terminated_length": 1039.8046875, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.9108133481266536, - "grad_norm": 2.4403631687164307, - "kl": 7.25, - "learning_rate": 1.2174381259948785e-07, - "loss": 0.5055, - "num_tokens": 1422879531.0, - "reward": 1.85498046875, - "reward_std": 0.49316757917404175, - "rewards/accuracy_reward/mean": 0.038306452333927155, - "rewards/accuracy_reward/std": 0.19212883710861206, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.93505859375, - "rewards/tag_count_reward/std": 0.19169752299785614, + "grad_norm": 1.882718563079834, + "kl": 2.916015625, + "learning_rate": 1.217601790221257e-07, + "loss": 0.1425, + "num_tokens": 1531450560.0, + "reward": 1.0771484375, + "reward_std": 0.43762797117233276, + "rewards/accuracy_reward/mean": 0.07459677755832672, + "rewards/accuracy_reward/std": 0.263004869222641, + "rewards/format_reward/mean": 0.111328125, + "rewards/format_reward/std": 0.31484565138816833, + "rewards/tag_count_reward/mean": 0.8935546875, + "rewards/tag_count_reward/std": 0.21134056150913239, "step": 2668 }, { @@ -77387,27 +77387,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.103515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1948.0, - "completions/mean_length": 795.333984375, - "completions/mean_terminated_length": 747.0567626953125, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1175.927734375, + "completions/mean_terminated_length": 1075.23095703125, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, "epoch": 0.9111547324400444, - "grad_norm": 3.1153383255004883, - "kl": 7.96875, - "learning_rate": 1.2157948305824184e-07, - "loss": 0.5038, - "num_tokens": 1423376502.0, - "reward": 1.81298828125, - "reward_std": 0.5767254829406738, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.20627647638320923, + "grad_norm": 3.359076976776123, + "kl": 2.97265625, + "learning_rate": 1.2159572680818183e-07, + "loss": 0.1653, + "num_tokens": 1532142395.0, + "reward": 1.08984375, + "reward_std": 0.47652333974838257, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.1171875, + "rewards/format_reward/std": 0.32195815443992615, + "rewards/tag_count_reward/mean": 0.880859375, + "rewards/tag_count_reward/std": 0.22549107670783997, "step": 2669 }, { @@ -77416,27 +77416,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1958.0, - "completions/mean_length": 791.0546875, - "completions/mean_terminated_length": 734.620361328125, - "completions/min_length": 172.0, - "completions/min_terminated_length": 172.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1115.564453125, + "completions/mean_terminated_length": 1040.8121337890625, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, "epoch": 0.9114961167534352, - "grad_norm": 2.4565532207489014, - "kl": 8.328125, - "learning_rate": 1.2141576158156031e-07, - "loss": 0.5237, - "num_tokens": 1423860226.0, - "reward": 1.8203125, - "reward_std": 0.5129342675209045, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.2099001258611679, + "grad_norm": 1.5530048608779907, + "kl": 2.1484375, + "learning_rate": 1.214318830971716e-07, + "loss": 0.0883, + "num_tokens": 1532792268.0, + "reward": 1.08056640625, + "reward_std": 0.4171370267868042, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.091796875, + "rewards/format_reward/std": 0.289021372795105, + "rewards/tag_count_reward/mean": 0.89501953125, + "rewards/tag_count_reward/std": 0.22083191573619843, "step": 2670 }, { @@ -77445,27 +77445,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1884.0, - "completions/mean_length": 758.23046875, - "completions/mean_terminated_length": 711.23486328125, - "completions/min_length": 99.0, - "completions/min_terminated_length": 99.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1140.28515625, + "completions/mean_terminated_length": 1059.170166015625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, "epoch": 0.9118375010668259, - "grad_norm": 1.8298970460891724, - "kl": 6.8984375, - "learning_rate": 1.2125264840181623e-07, - "loss": 0.4461, - "num_tokens": 1424323720.0, - "reward": 1.8623046875, - "reward_std": 0.4886665940284729, - "rewards/accuracy_reward/mean": 0.07258064299821854, - "rewards/accuracy_reward/std": 0.25970885157585144, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.9306640625, - "rewards/tag_count_reward/std": 0.1878843754529953, + "grad_norm": 3.5011441707611084, + "kl": 2.86328125, + "learning_rate": 1.2126864812181754e-07, + "loss": 0.128, + "num_tokens": 1533451374.0, + "reward": 1.07763671875, + "reward_std": 0.4584527909755707, + "rewards/accuracy_reward/mean": 0.08064515888690948, + "rewards/accuracy_reward/std": 0.2725643217563629, + "rewards/format_reward/mean": 0.11328125, + "rewards/format_reward/std": 0.3172462284564972, + "rewards/tag_count_reward/mean": 0.88623046875, + "rewards/tag_count_reward/std": 0.22744829952716827, "step": 2671 }, { @@ -77474,27 +77474,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 767.19140625, - "completions/mean_terminated_length": 736.4520263671875, - "completions/min_length": 63.0, - "completions/min_terminated_length": 63.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1096.646484375, + "completions/mean_terminated_length": 1041.6094970703125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, "epoch": 0.9121788853802167, - "grad_norm": 1.485329508781433, - "kl": 5.9375, - "learning_rate": 1.2109014375051868e-07, - "loss": 0.3911, - "num_tokens": 1424790618.0, - "reward": 1.88623046875, - "reward_std": 0.4452371597290039, - "rewards/accuracy_reward/mean": 0.04032257944345474, - "rewards/accuracy_reward/std": 0.19691328704357147, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.94873046875, - "rewards/tag_count_reward/std": 0.16679713129997253, + "grad_norm": 2.4744904041290283, + "kl": 2.205078125, + "learning_rate": 1.2110602211397773e-07, + "loss": 0.0815, + "num_tokens": 1534086953.0, + "reward": 1.103515625, + "reward_std": 0.44481056928634644, + "rewards/accuracy_reward/mean": 0.08467742055654526, + "rewards/accuracy_reward/std": 0.278682142496109, + "rewards/format_reward/mean": 0.107421875, + "rewards/format_reward/std": 0.30995169281959534, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.1901114583015442, "step": 2672 }, { @@ -77503,27 +77503,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 817.265625, - "completions/mean_terminated_length": 751.423828125, - "completions/min_length": 124.0, - "completions/min_terminated_length": 124.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1099.828125, + "completions/mean_terminated_length": 1059.2750244140625, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, "epoch": 0.9125202696936076, - "grad_norm": 0.9762124419212341, - "kl": 6.7890625, - "learning_rate": 1.2092824785831342e-07, - "loss": 0.4285, - "num_tokens": 1425283618.0, - "reward": 1.8515625, - "reward_std": 0.5252445340156555, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.20585507154464722, + "grad_norm": 1.8155720233917236, + "kl": 2.060546875, + "learning_rate": 1.2094400530464508e-07, + "loss": 0.0958, + "num_tokens": 1534724625.0, + "reward": 1.115234375, + "reward_std": 0.42107412219047546, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.09765625, + "rewards/format_reward/std": 0.29713961482048035, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.20147298276424408, "step": 2673 }, { @@ -77532,27 +77532,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 704.01171875, - "completions/mean_terminated_length": 668.9979858398438, - "completions/min_length": 79.0, - "completions/min_terminated_length": 79.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1023.72265625, + "completions/mean_terminated_length": 950.8660888671875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, "epoch": 0.9128616540069984, - "grad_norm": 1.9770690202713013, - "kl": 6.04296875, - "learning_rate": 1.2076696095498203e-07, - "loss": 0.4296, - "num_tokens": 1425726760.0, - "reward": 1.8896484375, - "reward_std": 0.4971660077571869, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.17660093307495117, + "grad_norm": 2.5986690521240234, + "kl": 3.55078125, + "learning_rate": 1.2078259792394745e-07, + "loss": 0.2231, + "num_tokens": 1535331459.0, + "reward": 1.07763671875, + "reward_std": 0.4224942624568939, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.25592297315597534, + "rewards/tag_count_reward/mean": 0.88623046875, + "rewards/tag_count_reward/std": 0.2327636331319809, "step": 2674 }, { @@ -77561,27 +77561,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1918.0, - "completions/mean_length": 752.548828125, - "completions/mean_terminated_length": 716.1304931640625, - "completions/min_length": 72.0, - "completions/min_terminated_length": 72.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1084.291015625, + "completions/mean_terminated_length": 1026.4285888671875, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, "epoch": 0.9132030383203892, - "grad_norm": 2.3720006942749023, - "kl": 4.87890625, - "learning_rate": 1.2060628326944175e-07, - "loss": 0.3497, - "num_tokens": 1426193921.0, - "reward": 1.96044921875, - "reward_std": 0.46277543902397156, - "rewards/accuracy_reward/mean": 0.11328125, - "rewards/accuracy_reward/std": 0.3172462284564972, - "rewards/format_reward/mean": 0.900390625, - "rewards/format_reward/std": 0.29977133870124817, - "rewards/tag_count_reward/mean": 0.94677734375, - "rewards/tag_count_reward/std": 0.17267994582653046, + "grad_norm": 3.9908597469329834, + "kl": 1.9140625, + "learning_rate": 1.2062180020114684e-07, + "loss": 0.0654, + "num_tokens": 1535968472.0, + "reward": 1.22998046875, + "reward_std": 0.4509434103965759, + "rewards/accuracy_reward/mean": 0.181640625, + "rewards/accuracy_reward/std": 0.38592514395713806, + "rewards/format_reward/mean": 0.130859375, + "rewards/format_reward/std": 0.33757632970809937, + "rewards/tag_count_reward/mean": 0.91748046875, + "rewards/tag_count_reward/std": 0.19635149836540222, "step": 2675 }, { @@ -77590,27 +77590,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2016.0, - "completions/mean_length": 831.55078125, - "completions/mean_terminated_length": 789.7737426757812, - "completions/min_length": 99.0, - "completions/min_terminated_length": 99.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1122.892578125, + "completions/mean_terminated_length": 1052.926513671875, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, "epoch": 0.91354442263378, - "grad_norm": 2.7585463523864746, - "kl": 6.359375, - "learning_rate": 1.204462150297452e-07, - "loss": 0.4475, - "num_tokens": 1426697723.0, - "reward": 1.84716796875, - "reward_std": 0.473530113697052, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.1851712167263031, + "grad_norm": 3.2811944484710693, + "kl": 2.09375, + "learning_rate": 1.204616123646394e-07, + "loss": 0.0985, + "num_tokens": 1536621441.0, + "reward": 1.0654296875, + "reward_std": 0.4020102024078369, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.1015625, + "rewards/format_reward/std": 0.30236753821372986, + "rewards/tag_count_reward/mean": 0.9033203125, + "rewards/tag_count_reward/std": 0.2108335644006729, "step": 2676 }, { @@ -77619,27 +77619,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 768.109375, - "completions/mean_terminated_length": 716.081298828125, - "completions/min_length": 92.0, - "completions/min_terminated_length": 92.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1105.826171875, + "completions/mean_terminated_length": 990.12060546875, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, "epoch": 0.9138858069471708, - "grad_norm": 1.0900766849517822, - "kl": 6.47265625, - "learning_rate": 1.202867564630799e-07, - "loss": 0.4529, - "num_tokens": 1427177603.0, - "reward": 1.92919921875, - "reward_std": 0.5192950367927551, - "rewards/accuracy_reward/mean": 0.12890625, - "rewards/accuracy_reward/std": 0.33542385697364807, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.19834154844284058, + "grad_norm": 5.379909992218018, + "kl": 2.9140625, + "learning_rate": 1.203020346419551e-07, + "loss": 0.1967, + "num_tokens": 1537274232.0, + "reward": 1.13818359375, + "reward_std": 0.4906888008117676, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, + "rewards/format_reward/mean": 0.1015625, + "rewards/format_reward/std": 0.30236753821372986, + "rewards/tag_count_reward/mean": 0.88232421875, + "rewards/tag_count_reward/std": 0.23344410955905914, "step": 2677 }, { @@ -77648,27 +77648,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1911.0, - "completions/mean_length": 682.728515625, - "completions/mean_terminated_length": 649.9620361328125, - "completions/min_length": 2.0, - "completions/min_terminated_length": 2.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 992.142578125, + "completions/mean_terminated_length": 953.6700439453125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, "epoch": 0.9142271912605616, - "grad_norm": 1.6666417121887207, - "kl": 6.5078125, - "learning_rate": 1.2012790779576833e-07, - "loss": 0.4309, - "num_tokens": 1427608872.0, - "reward": 1.9052734375, - "reward_std": 0.5004695653915405, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.17237325012683868, + "grad_norm": 9.234182357788086, + "kl": 2.689453125, + "learning_rate": 1.2014306725975718e-07, + "loss": 0.0825, + "num_tokens": 1537863921.0, + "reward": 1.1611328125, + "reward_std": 0.4584944248199463, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3310423493385315, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.19871997833251953, "step": 2678 }, { @@ -77677,27 +77677,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1909.0, - "completions/mean_length": 788.189453125, - "completions/mean_terminated_length": 747.5503540039062, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1027.912109375, + "completions/mean_terminated_length": 986.445068359375, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, "epoch": 0.9145685755739523, - "grad_norm": 0.8680918216705322, - "kl": 5.93359375, - "learning_rate": 1.1996966925326677e-07, - "loss": 0.3804, - "num_tokens": 1428091657.0, - "reward": 1.92236328125, - "reward_std": 0.4979419708251953, + "grad_norm": 1.985854983329773, + "kl": 1.501953125, + "learning_rate": 1.1998471044384193e-07, + "loss": 0.0678, + "num_tokens": 1538469444.0, + "reward": 1.15087890625, + "reward_std": 0.37759196758270264, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.16821186244487762, + "rewards/format_reward/mean": 0.107421875, + "rewards/format_reward/std": 0.30995169281959534, + "rewards/tag_count_reward/mean": 0.94580078125, + "rewards/tag_count_reward/std": 0.16512742638587952, "step": 2679 }, { @@ -77706,27 +77706,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1841.0, - "completions/mean_length": 720.80859375, - "completions/mean_terminated_length": 686.2324829101562, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1044.900390625, + "completions/mean_terminated_length": 971.2976684570312, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, "epoch": 0.9149099598873431, - "grad_norm": 2.042712926864624, - "kl": 5.4140625, - "learning_rate": 1.1981204106016626e-07, - "loss": 0.3843, - "num_tokens": 1428542999.0, - "reward": 1.92919921875, - "reward_std": 0.47073543071746826, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.896484375, - "rewards/format_reward/std": 0.30492907762527466, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.1699187308549881, + "grad_norm": 6.8311543464660645, + "kl": 3.8671875, + "learning_rate": 1.1982696441913848e-07, + "loss": 0.1944, + "num_tokens": 1539086721.0, + "reward": 1.140625, + "reward_std": 0.4593971371650696, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.12109375, + "rewards/format_reward/std": 0.3265552520751953, + "rewards/tag_count_reward/mean": 0.912109375, + "rewards/tag_count_reward/std": 0.20353971421718597, "step": 2680 }, { @@ -77735,27 +77735,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 740.578125, - "completions/mean_terminated_length": 714.5338745117188, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1076.14453125, + "completions/mean_terminated_length": 1009.1900024414062, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, "epoch": 0.915251344200734, - "grad_norm": 1.3484684228897095, - "kl": 5.07421875, - "learning_rate": 1.196550234401909e-07, - "loss": 0.2929, - "num_tokens": 1429011535.0, - "reward": 1.87255859375, - "reward_std": 0.470352441072464, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.869140625, + "grad_norm": 4.175885200500488, + "kl": 2.83984375, + "learning_rate": 1.1966982940970833e-07, + "loss": 0.1206, + "num_tokens": 1539727067.0, + "reward": 1.158203125, + "reward_std": 0.45506709814071655, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.130859375, "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.93701171875, - "rewards/tag_count_reward/std": 0.18456129729747772, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.20388682186603546, "step": 2681 }, { @@ -77764,27 +77764,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1896.0, - "completions/mean_length": 815.771484375, - "completions/mean_terminated_length": 760.4468994140625, - "completions/min_length": 123.0, - "completions/min_terminated_length": 123.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1207.693359375, + "completions/mean_terminated_length": 1138.407958984375, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, "epoch": 0.9155927285141248, - "grad_norm": 1.431976318359375, - "kl": 6.8828125, - "learning_rate": 1.194986166161986e-07, - "loss": 0.4446, - "num_tokens": 1429509402.0, - "reward": 1.8388671875, - "reward_std": 0.522997260093689, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.9306640625, - "rewards/tag_count_reward/std": 0.1878843754529953, + "grad_norm": 4.638640403747559, + "kl": 2.34375, + "learning_rate": 1.1951330563874515e-07, + "loss": 0.0931, + "num_tokens": 1540425598.0, + "reward": 1.08642578125, + "reward_std": 0.45741555094718933, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.130859375, + "rewards/format_reward/std": 0.33757632970809937, + "rewards/tag_count_reward/mean": 0.89697265625, + "rewards/tag_count_reward/std": 0.2063227891921997, "step": 2682 }, { @@ -77793,27 +77793,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1938.0, - "completions/mean_length": 776.03515625, - "completions/mean_terminated_length": 748.1077880859375, - "completions/min_length": 115.0, - "completions/min_terminated_length": 115.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1007.828125, + "completions/mean_terminated_length": 958.9038696289062, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, "epoch": 0.9159341128275156, - "grad_norm": 1.2878795862197876, - "kl": 5.16796875, - "learning_rate": 1.1934282081018023e-07, - "loss": 0.3696, - "num_tokens": 1429978284.0, - "reward": 1.974609375, - "reward_std": 0.48818308115005493, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310423493385315, - "rewards/format_reward/mean": 0.900390625, - "rewards/format_reward/std": 0.29977133870124817, - "rewards/tag_count_reward/mean": 0.94921875, - "rewards/tag_count_reward/std": 0.16436253488063812, + "grad_norm": 6.307188034057617, + "kl": 1.75, + "learning_rate": 1.1935739332857443e-07, + "loss": 0.1102, + "num_tokens": 1541013158.0, + "reward": 1.1845703125, + "reward_std": 0.39395949244499207, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3776407241821289, + "rewards/format_reward/mean": 0.08203125, + "rewards/format_reward/std": 0.2746807038784027, + "rewards/tag_count_reward/mean": 0.9306640625, + "rewards/tag_count_reward/std": 0.1750793755054474, "step": 2683 }, { @@ -77822,27 +77822,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1952.0, - "completions/mean_length": 734.421875, - "completions/mean_terminated_length": 700.200439453125, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1113.744140625, + "completions/mean_terminated_length": 1061.7340087890625, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, "epoch": 0.9162754971409064, - "grad_norm": 1.3206956386566162, - "kl": 5.94921875, - "learning_rate": 1.1918763624325942e-07, - "loss": 0.3782, - "num_tokens": 1430428660.0, - "reward": 1.916015625, - "reward_std": 0.5098152160644531, - "rewards/accuracy_reward/mean": 0.08870967477560043, - "rewards/accuracy_reward/std": 0.284611314535141, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.94140625, - "rewards/tag_count_reward/std": 0.17828376591205597, + "grad_norm": 2.4510347843170166, + "kl": 3.064453125, + "learning_rate": 1.192020927006531e-07, + "loss": 0.1723, + "num_tokens": 1541657747.0, + "reward": 1.11962890625, + "reward_std": 0.4408426582813263, + "rewards/accuracy_reward/mean": 0.11895161122083664, + "rewards/accuracy_reward/std": 0.3240584135055542, + "rewards/format_reward/mean": 0.09765625, + "rewards/format_reward/std": 0.29713961482048035, + "rewards/tag_count_reward/mean": 0.90673828125, + "rewards/tag_count_reward/std": 0.20623478293418884, "step": 2684 }, { @@ -77851,27 +77851,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 699.388671875, - "completions/mean_terminated_length": 667.0220336914062, - "completions/min_length": 74.0, - "completions/min_terminated_length": 74.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1030.4375, + "completions/mean_terminated_length": 958.0585327148438, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.9166168814542972, - "grad_norm": 2.0574090480804443, - "kl": 5.3203125, - "learning_rate": 1.1903306313569242e-07, - "loss": 0.354, - "num_tokens": 1430859515.0, - "reward": 1.9482421875, - "reward_std": 0.4741098880767822, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.892578125, - "rewards/format_reward/std": 0.30995169281959534, - "rewards/tag_count_reward/mean": 0.9501953125, - "rewards/tag_count_reward/std": 0.15391167998313904, + "grad_norm": 5.178215980529785, + "kl": 2.7890625, + "learning_rate": 1.1904740397556923e-07, + "loss": 0.1737, + "num_tokens": 1542258099.0, + "reward": 1.11181640625, + "reward_std": 0.41829195618629456, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.1015625, + "rewards/format_reward/std": 0.30236753821372986, + "rewards/tag_count_reward/mean": 0.90869140625, + "rewards/tag_count_reward/std": 0.2047327607870102, "step": 2685 }, { @@ -77880,27 +77880,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 828.232421875, - "completions/mean_terminated_length": 788.8850708007812, - "completions/min_length": 85.0, - "completions/min_terminated_length": 85.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1103.9296875, + "completions/mean_terminated_length": 1045.170166015625, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, "epoch": 0.916958265767688, - "grad_norm": 2.8428945541381836, - "kl": 6.96875, - "learning_rate": 1.1887910170686726e-07, - "loss": 0.4383, - "num_tokens": 1431354354.0, - "reward": 1.87060546875, - "reward_std": 0.49418410658836365, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.19477328658103943, + "grad_norm": 2.2769551277160645, + "kl": 2.41796875, + "learning_rate": 1.1889332737304179e-07, + "loss": 0.105, + "num_tokens": 1542894095.0, + "reward": 1.103515625, + "reward_std": 0.43565478920936584, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.1171875, + "rewards/format_reward/std": 0.32195815443992615, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.20448583364486694, "step": 2686 }, { @@ -77909,27 +77909,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 783.01953125, - "completions/mean_terminated_length": 757.8207397460938, - "completions/min_length": 85.0, - "completions/min_terminated_length": 85.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1023.865234375, + "completions/mean_terminated_length": 995.0742797851562, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, "epoch": 0.9172996500810787, - "grad_norm": 3.281104564666748, - "kl": 8.671875, - "learning_rate": 1.1872575217530422e-07, - "loss": 0.5114, - "num_tokens": 1431821788.0, - "reward": 1.85107421875, - "reward_std": 0.5400803089141846, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, + "grad_norm": 1.287642478942871, + "kl": 1.798828125, + "learning_rate": 1.187398631119203e-07, + "loss": 0.0665, + "num_tokens": 1543484842.0, + "reward": 1.14990234375, + "reward_std": 0.4154220223426819, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.103515625, + "rewards/format_reward/std": 0.30492907762527466, "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.19182707369327545, + "rewards/tag_count_reward/std": 0.18268270790576935, "step": 2687 }, { @@ -77938,27 +77938,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1983.0, - "completions/mean_length": 825.80859375, - "completions/mean_terminated_length": 770.9346313476562, - "completions/min_length": 85.0, - "completions/min_terminated_length": 85.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1111.095703125, + "completions/mean_terminated_length": 1074.98779296875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, "epoch": 0.9176410343944695, - "grad_norm": 3.8501455783843994, - "kl": 9.75, - "learning_rate": 1.1857301475865477e-07, - "loss": 0.5899, - "num_tokens": 1432317610.0, - "reward": 1.7919921875, - "reward_std": 0.5573383569717407, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.21086981892585754, + "grad_norm": 7.189255237579346, + "kl": 1.904296875, + "learning_rate": 1.1858701141018451e-07, + "loss": 0.0443, + "num_tokens": 1544126731.0, + "reward": 1.08447265625, + "reward_std": 0.41187939047813416, + "rewards/accuracy_reward/mean": 0.03515625, + "rewards/accuracy_reward/std": 0.1843547374010086, + "rewards/format_reward/mean": 0.140625, + "rewards/format_reward/std": 0.3479743003845215, + "rewards/tag_count_reward/mean": 0.90869140625, + "rewards/tag_count_reward/std": 0.19619084894657135, "step": 2688 }, { @@ -77967,27 +77967,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1999.0, - "completions/mean_length": 844.7890625, - "completions/mean_terminated_length": 795.8779907226562, - "completions/min_length": 120.0, - "completions/min_terminated_length": 120.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1200.916015625, + "completions/mean_terminated_length": 1142.5574951171875, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, "epoch": 0.9179824187078603, - "grad_norm": 5.4735941886901855, - "kl": 9.4140625, - "learning_rate": 1.1842088967370173e-07, - "loss": 0.5074, - "num_tokens": 1432819678.0, - "reward": 1.7939453125, - "reward_std": 0.5715153813362122, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.9130859375, - "rewards/tag_count_reward/std": 0.2081148475408554, + "grad_norm": 2.024338722229004, + "kl": 2.0703125, + "learning_rate": 1.1843477248494401e-07, + "loss": 0.0654, + "num_tokens": 1544811136.0, + "reward": 1.17529296875, + "reward_std": 0.4895121455192566, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.162109375, + "rewards/format_reward/std": 0.3689115643501282, + "rewards/tag_count_reward/mean": 0.90966796875, + "rewards/tag_count_reward/std": 0.2057616412639618, "step": 2689 }, { @@ -77996,27 +77996,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 790.75, - "completions/mean_terminated_length": 739.6422729492188, - "completions/min_length": 115.0, - "completions/min_terminated_length": 115.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1079.283203125, + "completions/mean_terminated_length": 1029.554443359375, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, "epoch": 0.9183238030212512, - "grad_norm": 3.8919737339019775, - "kl": 8.8125, - "learning_rate": 1.1826937713635902e-07, - "loss": 0.5271, - "num_tokens": 1433308558.0, - "reward": 1.8359375, - "reward_std": 0.5754671096801758, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.20496191084384918, + "grad_norm": 1.5518217086791992, + "kl": 2.501953125, + "learning_rate": 1.1828314655243826e-07, + "loss": 0.1402, + "num_tokens": 1545447745.0, + "reward": 1.123046875, + "reward_std": 0.4509558081626892, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.126953125, + "rewards/format_reward/std": 0.33324605226516724, + "rewards/tag_count_reward/mean": 0.916015625, + "rewards/tag_count_reward/std": 0.19666332006454468, "step": 2690 }, { @@ -78025,27 +78025,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 806.310546875, - "completions/mean_terminated_length": 771.403564453125, - "completions/min_length": 98.0, - "completions/min_terminated_length": 98.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1154.5390625, + "completions/mean_terminated_length": 1096.956298828125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, "epoch": 0.918665187334642, - "grad_norm": 3.3554928302764893, - "kl": 7.59375, - "learning_rate": 1.1811847736167078e-07, - "loss": 0.4384, - "num_tokens": 1433792733.0, - "reward": 1.8203125, - "reward_std": 0.5457074642181396, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.826171875, - "rewards/format_reward/std": 0.3793322443962097, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.20338943600654602, + "grad_norm": 1.7109220027923584, + "kl": 2.2421875, + "learning_rate": 1.1813213382803569e-07, + "loss": 0.0852, + "num_tokens": 1546110213.0, + "reward": 1.1630859375, + "reward_std": 0.4548121392726898, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36344730854034424, + "rewards/tag_count_reward/mean": 0.9033203125, + "rewards/tag_count_reward/std": 0.2073236107826233, "step": 2691 }, { @@ -78054,27 +78054,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1957.0, - "completions/mean_length": 798.337890625, - "completions/mean_terminated_length": 755.4202270507812, - "completions/min_length": 67.0, - "completions/min_terminated_length": 67.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1093.4609375, + "completions/mean_terminated_length": 1036.1490478515625, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.9190065716480328, - "grad_norm": 2.3615872859954834, - "kl": 7.328125, - "learning_rate": 1.1796819056381175e-07, - "loss": 0.4222, - "num_tokens": 1434272154.0, - "reward": 1.85595703125, - "reward_std": 0.5467626452445984, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.207458958029747, + "grad_norm": 2.703613519668579, + "kl": 2.705078125, + "learning_rate": 1.1798173452623397e-07, + "loss": 0.1685, + "num_tokens": 1546740737.0, + "reward": 1.1728515625, + "reward_std": 0.46241694688796997, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.31241437792778015, + "rewards/tag_count_reward/mean": 0.9052734375, + "rewards/tag_count_reward/std": 0.2052670568227768, "step": 2692 }, { @@ -78083,27 +78083,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1924.0, - "completions/mean_length": 747.86328125, - "completions/mean_terminated_length": 719.3173828125, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1032.85546875, + "completions/mean_terminated_length": 985.1083374023438, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, "epoch": 0.9193479559614236, - "grad_norm": 2.554333209991455, - "kl": 7.6484375, - "learning_rate": 1.178185169560865e-07, - "loss": 0.4774, - "num_tokens": 1434736372.0, - "reward": 1.859375, - "reward_std": 0.5730640292167664, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.18783605098724365, + "grad_norm": 4.711479663848877, + "kl": 2.115234375, + "learning_rate": 1.1783194886065931e-07, + "loss": 0.0682, + "num_tokens": 1547350871.0, + "reward": 1.21533203125, + "reward_std": 0.46634024381637573, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, + "rewards/format_reward/mean": 0.134765625, + "rewards/format_reward/std": 0.3418070077896118, + "rewards/tag_count_reward/mean": 0.92431640625, + "rewards/tag_count_reward/std": 0.1928505003452301, "step": 2693 }, { @@ -78112,27 +78112,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 796.974609375, - "completions/mean_terminated_length": 735.44873046875, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1100.568359375, + "completions/mean_terminated_length": 1037.40625, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, "epoch": 0.9196893402748144, - "grad_norm": 1.866373896598816, - "kl": 5.65625, - "learning_rate": 1.1766945675092938e-07, - "loss": 0.3898, - "num_tokens": 1435225591.0, - "reward": 1.873046875, - "reward_std": 0.5167508721351624, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.19384446740150452, + "grad_norm": 2.458343982696533, + "kl": 2.51171875, + "learning_rate": 1.1768277704406647e-07, + "loss": 0.129, + "num_tokens": 1547995530.0, + "reward": 1.10595703125, + "reward_std": 0.4494969844818115, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.1171875, + "rewards/format_reward/std": 0.32195815443992615, + "rewards/tag_count_reward/mean": 0.89306640625, + "rewards/tag_count_reward/std": 0.21933400630950928, "step": 2694 }, { @@ -78141,27 +78141,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 782.41015625, - "completions/mean_terminated_length": 733.6348876953125, - "completions/min_length": 8.0, - "completions/min_terminated_length": 8.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1025.275390625, + "completions/mean_terminated_length": 990.1515502929688, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.9200307245882051, - "grad_norm": 1.9280846118927002, - "kl": 6.5546875, - "learning_rate": 1.1752101015990404e-07, - "loss": 0.4142, - "num_tokens": 1435695225.0, - "reward": 1.8388671875, - "reward_std": 0.59491366147995, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.20715762674808502, + "grad_norm": 4.200669765472412, + "kl": 1.921875, + "learning_rate": 1.1753421928833825e-07, + "loss": 0.1133, + "num_tokens": 1548589511.0, + "reward": 1.197265625, + "reward_std": 0.4646907448768616, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.1328125, + "rewards/format_reward/std": 0.33970388770103455, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.17398715019226074, "step": 2695 }, { @@ -78170,27 +78170,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.083984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 826.38671875, - "completions/mean_terminated_length": 789.51708984375, - "completions/min_length": 153.0, - "completions/min_terminated_length": 153.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1151.126953125, + "completions/mean_terminated_length": 1068.897705078125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, "epoch": 0.9203721089015959, - "grad_norm": 2.4268991947174072, - "kl": 6.58203125, - "learning_rate": 1.1737317739370323e-07, - "loss": 0.441, - "num_tokens": 1436199871.0, - "reward": 1.82275390625, - "reward_std": 0.5685132145881653, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.833984375, - "rewards/format_reward/std": 0.3724585771560669, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.20937539637088776, + "grad_norm": 2.1564042568206787, + "kl": 3.02734375, + "learning_rate": 1.1738627580448519e-07, + "loss": 0.164, + "num_tokens": 1549260424.0, + "reward": 1.0966796875, + "reward_std": 0.46539485454559326, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.12890625, + "rewards/format_reward/std": 0.33542385697364807, + "rewards/tag_count_reward/mean": 0.8876953125, + "rewards/tag_count_reward/std": 0.22575154900550842, "step": 2696 }, { @@ -78199,27 +78199,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 780.99609375, - "completions/mean_terminated_length": 732.1663208007812, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1087.74609375, + "completions/mean_terminated_length": 1008.57080078125, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, "epoch": 0.9207134932149867, - "grad_norm": 2.5049359798431396, - "kl": 5.90625, - "learning_rate": 1.172259586621487e-07, - "loss": 0.4165, - "num_tokens": 1436681549.0, - "reward": 1.81884765625, - "reward_std": 0.5418437123298645, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.20545026659965515, + "grad_norm": 3.5450525283813477, + "kl": 1.59375, + "learning_rate": 1.1723894680264526e-07, + "loss": 0.0863, + "num_tokens": 1549899158.0, + "reward": 1.10009765625, + "reward_std": 0.4532034397125244, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.103515625, + "rewards/format_reward/std": 0.30492907762527466, + "rewards/tag_count_reward/mean": 0.89697265625, + "rewards/tag_count_reward/std": 0.2104315161705017, "step": 2697 }, { @@ -78228,27 +78228,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1910.0, - "completions/mean_length": 821.501953125, - "completions/mean_terminated_length": 784.48486328125, - "completions/min_length": 71.0, - "completions/min_terminated_length": 71.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1097.822265625, + "completions/mean_terminated_length": 1057.183349609375, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, "epoch": 0.9210548775283776, - "grad_norm": 1.1954436302185059, - "kl": 5.3359375, - "learning_rate": 1.170793541741903e-07, - "loss": 0.3431, - "num_tokens": 1437175518.0, - "reward": 1.86865234375, - "reward_std": 0.5039311051368713, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.18960754573345184, + "grad_norm": 3.134887456893921, + "kl": 2.40234375, + "learning_rate": 1.170922324920839e-07, + "loss": 0.125, + "num_tokens": 1550534603.0, + "reward": 1.142578125, + "reward_std": 0.4391120970249176, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.142578125, + "rewards/format_reward/std": 0.3499840497970581, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.19246920943260193, "step": 2698 }, { @@ -78257,27 +78257,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1988.0, - "completions/mean_length": 803.15625, - "completions/mean_terminated_length": 749.9144897460938, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1094.44140625, + "completions/mean_terminated_length": 1039.27685546875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, "epoch": 0.9213962618417684, - "grad_norm": 1.1793044805526733, - "kl": 7.30859375, - "learning_rate": 1.169333641379065e-07, - "loss": 0.4721, - "num_tokens": 1437670638.0, - "reward": 1.7958984375, - "reward_std": 0.5920631885528564, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.9111328125, - "rewards/tag_count_reward/std": 0.2043152004480362, + "grad_norm": 1.6496870517730713, + "kl": 2.35546875, + "learning_rate": 1.1694613308119312e-07, + "loss": 0.1116, + "num_tokens": 1551178861.0, + "reward": 1.154296875, + "reward_std": 0.4800220727920532, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.154296875, + "rewards/format_reward/std": 0.36158639192581177, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.20392431318759918, "step": 2699 }, { @@ -78286,27 +78286,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1977.0, - "completions/mean_length": 787.669921875, - "completions/mean_terminated_length": 739.0973510742188, - "completions/min_length": 106.0, - "completions/min_terminated_length": 106.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1087.705078125, + "completions/mean_terminated_length": 1008.5264282226562, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, "epoch": 0.9217376461551592, - "grad_norm": 1.351790428161621, - "kl": 7.28125, - "learning_rate": 1.167879887605032e-07, - "loss": 0.4765, - "num_tokens": 1438154133.0, - "reward": 1.787109375, - "reward_std": 0.5768707990646362, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.908203125, - "rewards/tag_count_reward/std": 0.21699129045009613, + "grad_norm": 2.2735061645507812, + "kl": 2.35546875, + "learning_rate": 1.1680064877749168e-07, + "loss": 0.1228, + "num_tokens": 1551815974.0, + "reward": 1.1103515625, + "reward_std": 0.4287722110748291, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.123046875, + "rewards/format_reward/std": 0.32881227135658264, + "rewards/tag_count_reward/mean": 0.9033203125, + "rewards/tag_count_reward/std": 0.21141289174556732, "step": 2700 }, { @@ -78315,27 +78315,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1887.0, - "completions/mean_length": 772.0234375, - "completions/mean_terminated_length": 728.2020263671875, - "completions/min_length": 123.0, - "completions/min_terminated_length": 123.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1089.296875, + "completions/mean_terminated_length": 1021.1045532226562, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.92207903046855, - "grad_norm": 1.2594962120056152, - "kl": 6.8046875, - "learning_rate": 1.1664322824831437e-07, - "loss": 0.4079, - "num_tokens": 1438622721.0, - "reward": 1.80126953125, - "reward_std": 0.5574404001235962, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.19656065106391907, + "grad_norm": 3.1619954109191895, + "kl": 2.251953125, + "learning_rate": 1.1665577978762473e-07, + "loss": 0.095, + "num_tokens": 1552447006.0, + "reward": 1.14501953125, + "reward_std": 0.4418213963508606, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.13671875, + "rewards/format_reward/std": 0.3438861668109894, + "rewards/tag_count_reward/mean": 0.90673828125, + "rewards/tag_count_reward/std": 0.1914730817079544, "step": 2701 }, { @@ -78344,27 +78344,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1986.0, - "completions/mean_length": 830.994140625, - "completions/mean_terminated_length": 789.197998046875, - "completions/min_length": 25.0, - "completions/min_terminated_length": 25.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1098.712890625, + "completions/mean_terminated_length": 1049.9815673828125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, "epoch": 0.9224204147819408, - "grad_norm": 1.5090302228927612, - "kl": 5.8515625, - "learning_rate": 1.1649908280680094e-07, - "loss": 0.363, - "num_tokens": 1439121950.0, - "reward": 1.84423828125, - "reward_std": 0.5673857927322388, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.19360709190368652, + "grad_norm": 3.9778926372528076, + "kl": 2.37890625, + "learning_rate": 1.165115263173633e-07, + "loss": 0.0651, + "num_tokens": 1553083307.0, + "reward": 1.189453125, + "reward_std": 0.5304901003837585, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.16015625, + "rewards/format_reward/std": 0.3671095669269562, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.20690147578716278, "step": 2702 }, { @@ -78373,27 +78373,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 805.22265625, - "completions/mean_terminated_length": 762.5414428710938, - "completions/min_length": 83.0, - "completions/min_terminated_length": 83.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1104.154296875, + "completions/mean_terminated_length": 1039.1295166015625, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, "epoch": 0.9227617990953315, - "grad_norm": 0.9459572434425354, - "kl": 7.421875, - "learning_rate": 1.1635555264055105e-07, - "loss": 0.4642, - "num_tokens": 1439611888.0, - "reward": 1.8203125, - "reward_std": 0.591699481010437, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.837890625, + "grad_norm": 1.8726797103881836, + "kl": 2.634765625, + "learning_rate": 1.1636788857160406e-07, + "loss": 0.1127, + "num_tokens": 1553726298.0, + "reward": 1.162109375, + "reward_std": 0.4818079471588135, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.162109375, "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.21201092004776, + "rewards/tag_count_reward/mean": 0.908203125, + "rewards/tag_count_reward/std": 0.20180471241474152, "step": 2703 }, { @@ -78402,27 +78402,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 734.56640625, - "completions/mean_terminated_length": 705.728515625, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1077.169921875, + "completions/mean_terminated_length": 1014.600830078125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, "epoch": 0.9231031834087223, - "grad_norm": 2.5126640796661377, - "kl": 5.015625, - "learning_rate": 1.1621263795327965e-07, - "loss": 0.3573, - "num_tokens": 1440070114.0, - "reward": 1.92041015625, - "reward_std": 0.4671206772327423, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.94580078125, - "rewards/tag_count_reward/std": 0.16806410253047943, + "grad_norm": 3.472248077392578, + "kl": 2.671875, + "learning_rate": 1.1622486675436945e-07, + "loss": 0.1208, + "num_tokens": 1554359937.0, + "reward": 1.18212890625, + "reward_std": 0.5241494178771973, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.3776407241821289, + "rewards/tag_count_reward/mean": 0.90673828125, + "rewards/tag_count_reward/std": 0.21380557119846344, "step": 2704 }, { @@ -78431,27 +78431,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 764.75390625, - "completions/mean_terminated_length": 733.9560546875, - "completions/min_length": 78.0, - "completions/min_terminated_length": 78.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1064.9609375, + "completions/mean_terminated_length": 1010.235107421875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, "epoch": 0.9234445677221131, - "grad_norm": 1.2936022281646729, - "kl": 7.40625, - "learning_rate": 1.1607033894782782e-07, - "loss": 0.4485, - "num_tokens": 1440541268.0, - "reward": 1.81689453125, - "reward_std": 0.5451605319976807, - "rewards/accuracy_reward/mean": 0.0625, - "rewards/accuracy_reward/std": 0.2422981858253479, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.2010640949010849, + "grad_norm": 2.5503907203674316, + "kl": 1.939453125, + "learning_rate": 1.1608246106880662e-07, + "loss": 0.0969, + "num_tokens": 1554984797.0, + "reward": 1.123046875, + "reward_std": 0.4415420889854431, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.12890625, + "rewards/format_reward/std": 0.33542385697364807, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.2104184627532959, "step": 2705 }, { @@ -78460,27 +78460,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, - "completions/mean_length": 809.0625, - "completions/mean_terminated_length": 766.51318359375, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/mean_length": 1136.271484375, + "completions/mean_terminated_length": 1063.1793212890625, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, "epoch": 0.923785952035504, - "grad_norm": 1.8083739280700684, - "kl": 7.9140625, - "learning_rate": 1.1592865582616306e-07, - "loss": 0.4782, - "num_tokens": 1441032900.0, - "reward": 1.87060546875, - "reward_std": 0.5810139775276184, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.19933150708675385, + "grad_norm": 2.1283445358276367, + "kl": 2.53515625, + "learning_rate": 1.1594067171718788e-07, + "loss": 0.1347, + "num_tokens": 1555643960.0, + "reward": 1.216796875, + "reward_std": 0.5272696018218994, + "rewards/accuracy_reward/mean": 0.162109375, + "rewards/accuracy_reward/std": 0.3689115643501282, + "rewards/format_reward/mean": 0.16015625, + "rewards/format_reward/std": 0.3671095669269562, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.21067261695861816, "step": 2706 }, { @@ -78489,27 +78489,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1911.0, - "completions/mean_length": 704.283203125, - "completions/mean_terminated_length": 669.2765502929688, - "completions/min_length": 90.0, - "completions/min_terminated_length": 90.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1044.380859375, + "completions/mean_terminated_length": 954.6957397460938, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, "epoch": 0.9241273363488948, - "grad_norm": 1.5286200046539307, - "kl": 6.203125, - "learning_rate": 1.1578758878937856e-07, - "loss": 0.3854, - "num_tokens": 1441476325.0, - "reward": 1.92724609375, - "reward_std": 0.5413064956665039, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.93701171875, - "rewards/tag_count_reward/std": 0.18389739096164703, + "grad_norm": 3.392817497253418, + "kl": 3.81640625, + "learning_rate": 1.1579949890090993e-07, + "loss": 0.2627, + "num_tokens": 1556261515.0, + "reward": 1.1630859375, + "reward_std": 0.4517636299133301, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, + "rewards/format_reward/mean": 0.111328125, + "rewards/format_reward/std": 0.31484565138816833, + "rewards/tag_count_reward/mean": 0.8916015625, + "rewards/tag_count_reward/std": 0.22711879014968872, "step": 2707 }, { @@ -78518,27 +78518,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 746.544921875, - "completions/mean_terminated_length": 709.9578247070312, - "completions/min_length": 44.0, - "completions/min_terminated_length": 44.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1028.248046875, + "completions/mean_terminated_length": 975.8994140625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.9244687206622856, - "grad_norm": 2.0551059246063232, - "kl": 7.453125, - "learning_rate": 1.1564713803769327e-07, - "loss": 0.4896, - "num_tokens": 1441931740.0, - "reward": 1.861328125, - "reward_std": 0.5031090974807739, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.18961825966835022, + "grad_norm": 2.8030009269714355, + "kl": 2.7734375, + "learning_rate": 1.15658942820494e-07, + "loss": 0.1118, + "num_tokens": 1556861162.0, + "reward": 1.083984375, + "reward_std": 0.46771833300590515, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.126953125, + "rewards/format_reward/std": 0.33324605226516724, + "rewards/tag_count_reward/mean": 0.896484375, + "rewards/tag_count_reward/std": 0.21677981317043304, "step": 2708 }, { @@ -78547,27 +78547,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 822.95703125, - "completions/mean_terminated_length": 767.955078125, - "completions/min_length": 115.0, - "completions/min_terminated_length": 115.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1135.125, + "completions/mean_terminated_length": 1072.23388671875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.9248101049756764, - "grad_norm": 1.6736626625061035, - "kl": 7.71875, - "learning_rate": 1.1550730377045126e-07, - "loss": 0.4882, - "num_tokens": 1442435782.0, - "reward": 1.8505859375, - "reward_std": 0.530368447303772, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.1946192979812622, + "grad_norm": 3.146209239959717, + "kl": 2.8984375, + "learning_rate": 1.1551900367558484e-07, + "loss": 0.1272, + "num_tokens": 1557525034.0, + "reward": 1.08349609375, + "reward_std": 0.46234554052352905, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.1328125, + "rewards/format_reward/std": 0.33970388770103455, + "rewards/tag_count_reward/mean": 0.88427734375, + "rewards/tag_count_reward/std": 0.22860905528068542, "step": 2709 }, { @@ -78576,27 +78576,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.083984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1936.0, - "completions/mean_length": 776.63671875, - "completions/mean_terminated_length": 748.7225341796875, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1149.5, + "completions/mean_terminated_length": 1067.12158203125, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, "epoch": 0.9251514892890672, - "grad_norm": 1.986305594444275, - "kl": 5.0625, - "learning_rate": 1.1536808618612175e-07, - "loss": 0.3118, - "num_tokens": 1442914828.0, - "reward": 1.8603515625, - "reward_std": 0.45481055974960327, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.17934982478618622, + "grad_norm": 2.067101240158081, + "kl": 2.736328125, + "learning_rate": 1.1537968166495146e-07, + "loss": 0.1577, + "num_tokens": 1558194986.0, + "reward": 1.12158203125, + "reward_std": 0.4547417163848877, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.154296875, + "rewards/format_reward/std": 0.36158639192581177, + "rewards/tag_count_reward/mean": 0.89111328125, + "rewards/tag_count_reward/std": 0.21668227016925812, "step": 2710 }, { @@ -78605,27 +78605,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1975.0, - "completions/mean_length": 768.765625, - "completions/mean_terminated_length": 732.8031616210938, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1128.544921875, + "completions/mean_terminated_length": 1052.733642578125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, "epoch": 0.925492873602458, - "grad_norm": 1.7764705419540405, - "kl": 5.5390625, - "learning_rate": 1.1522948548229875e-07, - "loss": 0.362, - "num_tokens": 1443393476.0, - "reward": 1.88330078125, - "reward_std": 0.47507160902023315, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.17322689294815063, + "grad_norm": 77.16455078125, + "kl": 5.73828125, + "learning_rate": 1.1524097698648583e-07, + "loss": 0.283, + "num_tokens": 1558857841.0, + "reward": 1.11376953125, + "reward_std": 0.5110874772071838, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.15234375, + "rewards/format_reward/std": 0.35970520973205566, + "rewards/tag_count_reward/mean": 0.86962890625, + "rewards/tag_count_reward/std": 0.24524930119514465, "step": 2711 }, { @@ -78634,27 +78634,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1953.0, - "completions/mean_length": 727.130859375, - "completions/mean_terminated_length": 689.9979858398438, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 979.322265625, + "completions/mean_terminated_length": 933.6151123046875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.9258342579158487, - "grad_norm": 0.7059771418571472, - "kl": 5.984375, - "learning_rate": 1.150915018557004e-07, - "loss": 0.3872, - "num_tokens": 1443840919.0, - "reward": 1.93603515625, - "reward_std": 0.4882371127605438, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.94775390625, - "rewards/tag_count_reward/std": 0.1620255559682846, + "grad_norm": 1.4812023639678955, + "kl": 2.18359375, + "learning_rate": 1.1510288983720338e-07, + "loss": 0.0687, + "num_tokens": 1559434406.0, + "reward": 1.21630859375, + "reward_std": 0.45011234283447266, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.146484375, + "rewards/format_reward/std": 0.35393697023391724, + "rewards/tag_count_reward/mean": 0.93115234375, + "rewards/tag_count_reward/std": 0.17906923592090607, "step": 2712 }, { @@ -78663,27 +78663,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1949.0, - "completions/mean_length": 771.474609375, - "completions/mean_terminated_length": 746.0458374023438, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1081.162109375, + "completions/mean_terminated_length": 1033.6126708984375, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, "epoch": 0.9261756422292395, - "grad_norm": 0.8078057765960693, - "kl": 4.859375, - "learning_rate": 1.1495413550216933e-07, - "loss": 0.3087, - "num_tokens": 1444311786.0, - "reward": 1.919921875, - "reward_std": 0.4650794267654419, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.892578125, - "rewards/format_reward/std": 0.30995169281959534, - "rewards/tag_count_reward/mean": 0.9453125, - "rewards/tag_count_reward/std": 0.16459491848945618, + "grad_norm": 1.109947681427002, + "kl": 1.9453125, + "learning_rate": 1.1496542041324228e-07, + "loss": 0.0464, + "num_tokens": 1560063833.0, + "reward": 1.18896484375, + "reward_std": 0.5248522758483887, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.181640625, + "rewards/format_reward/std": 0.38592514395713806, + "rewards/tag_count_reward/mean": 0.90771484375, + "rewards/tag_count_reward/std": 0.20127783715724945, "step": 2713 }, { @@ -78692,27 +78692,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 798.470703125, - "completions/mean_terminated_length": 765.9178466796875, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1113.748046875, + "completions/mean_terminated_length": 1036.7166748046875, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, "epoch": 0.9265170265426304, - "grad_norm": 0.7098759412765503, - "kl": 5.828125, - "learning_rate": 1.1481738661667192e-07, - "loss": 0.3736, - "num_tokens": 1444795531.0, - "reward": 1.85205078125, - "reward_std": 0.5454970002174377, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.19125337898731232, + "grad_norm": 2.1435954570770264, + "kl": 1.86328125, + "learning_rate": 1.1482856890986333e-07, + "loss": 0.0855, + "num_tokens": 1560709000.0, + "reward": 1.1064453125, + "reward_std": 0.44914382696151733, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.1328125, + "rewards/format_reward/std": 0.33970388770103455, + "rewards/tag_count_reward/mean": 0.9013671875, + "rewards/tag_count_reward/std": 0.20580634474754333, "step": 2714 }, { @@ -78721,27 +78721,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 755.12109375, - "completions/mean_terminated_length": 726.7345581054688, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1126.5859375, + "completions/mean_terminated_length": 1050.613037109375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.9268584108560212, - "grad_norm": 1.1023526191711426, - "kl": 3.90625, - "learning_rate": 1.1468125539329826e-07, - "loss": 0.2647, - "num_tokens": 1445257993.0, - "reward": 1.94580078125, - "reward_std": 0.4347952604293823, - "rewards/accuracy_reward/mean": 0.087890625, - "rewards/accuracy_reward/std": 0.2834126651287079, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.95361328125, - "rewards/tag_count_reward/std": 0.15378980338573456, + "grad_norm": 1.5562067031860352, + "kl": 2.54296875, + "learning_rate": 1.1469233552144955e-07, + "loss": 0.1206, + "num_tokens": 1561361652.0, + "reward": 1.1826171875, + "reward_std": 0.509443461894989, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.15234375, + "rewards/format_reward/std": 0.35970520973205566, + "rewards/tag_count_reward/mean": 0.8974609375, + "rewards/tag_count_reward/std": 0.21554872393608093, "step": 2715 }, { @@ -78750,27 +78750,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1971.0, - "completions/mean_length": 780.587890625, - "completions/mean_terminated_length": 750.1700439453125, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1095.123046875, + "completions/mean_terminated_length": 1037.9110107421875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.927199795169412, - "grad_norm": 1.2092480659484863, - "kl": 4.3359375, - "learning_rate": 1.1454574202526165e-07, - "loss": 0.2947, - "num_tokens": 1445737494.0, - "reward": 1.94775390625, - "reward_std": 0.43313342332839966, - "rewards/accuracy_reward/mean": 0.08467742055654526, - "rewards/accuracy_reward/std": 0.278682142496109, - "rewards/format_reward/mean": 0.912109375, - "rewards/format_reward/std": 0.2834126651287079, - "rewards/tag_count_reward/mean": 0.95361328125, - "rewards/tag_count_reward/std": 0.15458305180072784, + "grad_norm": 2.301633834838867, + "kl": 2.78515625, + "learning_rate": 1.1455672044150609e-07, + "loss": 0.1299, + "num_tokens": 1562002195.0, + "reward": 1.16845703125, + "reward_std": 0.5061001181602478, + "rewards/accuracy_reward/mean": 0.0947580635547638, + "rewards/accuracy_reward/std": 0.29317617416381836, + "rewards/format_reward/mean": 0.17578125, + "rewards/format_reward/std": 0.3810062110424042, + "rewards/tag_count_reward/mean": 0.90087890625, + "rewards/tag_count_reward/std": 0.21345220506191254, "step": 2716 }, { @@ -78779,27 +78779,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 732.1015625, - "completions/mean_terminated_length": 711.21435546875, - "completions/min_length": 91.0, - "completions/min_terminated_length": 91.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1084.818359375, + "completions/mean_terminated_length": 1003.1928100585938, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, "epoch": 0.9275411794828028, - "grad_norm": 1.404892921447754, - "kl": 4.66015625, - "learning_rate": 1.1441084670489857e-07, - "loss": 0.3157, - "num_tokens": 1446183898.0, - "reward": 1.95556640625, - "reward_std": 0.44245997071266174, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, - "rewards/format_reward/mean": 0.896484375, - "rewards/format_reward/std": 0.30492907762527466, - "rewards/tag_count_reward/mean": 0.95166015625, - "rewards/tag_count_reward/std": 0.15867657959461212, + "grad_norm": 2.4795172214508057, + "kl": 3.16015625, + "learning_rate": 1.1442172386265972e-07, + "loss": 0.1624, + "num_tokens": 1562629190.0, + "reward": 1.13037109375, + "reward_std": 0.48734521865844727, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.123046875, + "rewards/format_reward/std": 0.32881227135658264, + "rewards/tag_count_reward/mean": 0.88623046875, + "rewards/tag_count_reward/std": 0.22690992057323456, "step": 2717 }, { @@ -78808,27 +78808,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 748.970703125, - "completions/mean_terminated_length": 720.4490966796875, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1045.966796875, + "completions/mean_terminated_length": 1013.6431274414062, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, "epoch": 0.9278825637961936, - "grad_norm": 2.1646533012390137, - "kl": 5.8828125, - "learning_rate": 1.1427656962366829e-07, - "loss": 0.4138, - "num_tokens": 1446652779.0, - "reward": 1.87548828125, - "reward_std": 0.49714913964271545, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.17251938581466675, + "grad_norm": 1.6832196712493896, + "kl": 2.1953125, + "learning_rate": 1.142873459766589e-07, + "loss": 0.0979, + "num_tokens": 1563250133.0, + "reward": 1.1728515625, + "reward_std": 0.4586141109466553, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.1484375, + "rewards/format_reward/std": 0.35588082671165466, + "rewards/tag_count_reward/mean": 0.9287109375, + "rewards/tag_count_reward/std": 0.1804969757795334, "step": 2718 }, { @@ -78837,27 +78837,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1929.0, - "completions/mean_length": 790.986328125, - "completions/mean_terminated_length": 755.6485595703125, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1067.73828125, + "completions/mean_terminated_length": 1032.020263671875, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, "epoch": 0.9282239481095844, - "grad_norm": 1.5209766626358032, - "kl": 6.0390625, - "learning_rate": 1.1414291097215244e-07, - "loss": 0.38, - "num_tokens": 1447135812.0, - "reward": 1.8623046875, - "reward_std": 0.4664610028266907, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.16874286532402039, + "grad_norm": 1.448181390762329, + "kl": 1.884765625, + "learning_rate": 1.1415358697437315e-07, + "loss": 0.0796, + "num_tokens": 1563874863.0, + "reward": 1.1162109375, + "reward_std": 0.43645647168159485, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.146484375, + "rewards/format_reward/std": 0.35393697023391724, + "rewards/tag_count_reward/mean": 0.9150390625, + "rewards/tag_count_reward/std": 0.19561834633350372, "step": 2719 }, { @@ -78866,27 +78866,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, - "completions/mean_length": 776.294921875, - "completions/mean_terminated_length": 737.9134521484375, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/mean_length": 1046.33203125, + "completions/mean_terminated_length": 999.2188110351562, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, "epoch": 0.9285653324229751, - "grad_norm": 1.9546750783920288, - "kl": 6.53125, - "learning_rate": 1.1400987094005518e-07, - "loss": 0.4213, - "num_tokens": 1447604603.0, - "reward": 1.89208984375, - "reward_std": 0.4767943024635315, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.94091796875, - "rewards/tag_count_reward/std": 0.172895610332489, + "grad_norm": 1.838381290435791, + "kl": 2.22265625, + "learning_rate": 1.1402044704579305e-07, + "loss": 0.0953, + "num_tokens": 1564481913.0, + "reward": 1.16845703125, + "reward_std": 0.4816431701183319, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.169921875, + "rewards/format_reward/std": 0.3759314715862274, + "rewards/tag_count_reward/mean": 0.91845703125, + "rewards/tag_count_reward/std": 0.19362683594226837, "step": 2720 }, { @@ -78895,27 +78895,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1901.0, - "completions/mean_length": 806.2734375, - "completions/mean_terminated_length": 779.0099487304688, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 1145.505859375, + "completions/mean_terminated_length": 1079.2850341796875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.9289067167363659, - "grad_norm": 1.4141117334365845, - "kl": 5.5546875, - "learning_rate": 1.1387744971620236e-07, - "loss": 0.3895, - "num_tokens": 1448095559.0, - "reward": 1.8623046875, - "reward_std": 0.48223400115966797, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.17795921862125397, + "grad_norm": 2.999922513961792, + "kl": 3.14453125, + "learning_rate": 1.1388792638002969e-07, + "loss": 0.1356, + "num_tokens": 1565146556.0, + "reward": 1.10546875, + "reward_std": 0.49174898862838745, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.130859375, + "rewards/format_reward/std": 0.33757632970809937, + "rewards/tag_count_reward/mean": 0.8828125, + "rewards/tag_count_reward/std": 0.2307935357093811, "step": 2721 }, { @@ -78924,27 +78924,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 711.4765625, - "completions/mean_terminated_length": 673.903564453125, - "completions/min_length": 63.0, - "completions/min_terminated_length": 63.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1031.849609375, + "completions/mean_terminated_length": 988.3890380859375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, "epoch": 0.9292481010497567, - "grad_norm": 2.679053544998169, - "kl": 6.5, - "learning_rate": 1.137456474885418e-07, - "loss": 0.3861, - "num_tokens": 1448537531.0, - "reward": 1.943359375, - "reward_std": 0.5373616218566895, - "rewards/accuracy_reward/mean": 0.140625, - "rewards/accuracy_reward/std": 0.3479743003845215, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.17900052666664124, + "grad_norm": 5.153967380523682, + "kl": 2.70703125, + "learning_rate": 1.1375602516531472e-07, + "loss": 0.1467, + "num_tokens": 1565752559.0, + "reward": 1.25146484375, + "reward_std": 0.5501196384429932, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38430243730545044, + "rewards/format_reward/mean": 0.169921875, + "rewards/format_reward/std": 0.3759314715862274, + "rewards/tag_count_reward/mean": 0.90185546875, + "rewards/tag_count_reward/std": 0.22121235728263855, "step": 2722 }, { @@ -78953,27 +78953,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1951.0, - "completions/mean_length": 756.78515625, - "completions/mean_terminated_length": 712.4404296875, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1080.392578125, + "completions/mean_terminated_length": 1002.8206176757812, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, "epoch": 0.9295894853631476, - "grad_norm": 6.534995079040527, - "kl": 9.046875, - "learning_rate": 1.136144644441426e-07, - "loss": 0.5285, - "num_tokens": 1449003133.0, - "reward": 1.86669921875, - "reward_std": 0.5419111847877502, - "rewards/accuracy_reward/mean": 0.10080645233392715, - "rewards/accuracy_reward/std": 0.30137622356414795, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.19733208417892456, + "grad_norm": 4.036587238311768, + "kl": 2.759765625, + "learning_rate": 1.136247435889998e-07, + "loss": 0.1497, + "num_tokens": 1566383848.0, + "reward": 1.1806640625, + "reward_std": 0.5243150591850281, + "rewards/accuracy_reward/mean": 0.13508065044879913, + "rewards/accuracy_reward/std": 0.3421548008918762, + "rewards/format_reward/mean": 0.1640625, + "rewards/format_reward/std": 0.37069445848464966, + "rewards/tag_count_reward/mean": 0.8857421875, + "rewards/tag_count_reward/std": 0.22693359851837158, "step": 2723 }, { @@ -78982,27 +78982,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1907.0, - "completions/mean_length": 755.728515625, - "completions/mean_terminated_length": 719.3995971679688, - "completions/min_length": 114.0, - "completions/min_terminated_length": 114.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1043.9921875, + "completions/mean_terminated_length": 977.0584106445312, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, "epoch": 0.9299308696765384, - "grad_norm": 5.791825294494629, - "kl": 8.21875, - "learning_rate": 1.1348390076919519e-07, - "loss": 0.4888, - "num_tokens": 1449465458.0, - "reward": 1.828125, - "reward_std": 0.5473470687866211, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.19091396033763885, + "grad_norm": 1.3248389959335327, + "kl": 3.14453125, + "learning_rate": 1.134940818375565e-07, + "loss": 0.1424, + "num_tokens": 1566993764.0, + "reward": 1.1865234375, + "reward_std": 0.5169667601585388, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.185546875, + "rewards/format_reward/std": 0.38912075757980347, + "rewards/tag_count_reward/mean": 0.8916015625, + "rewards/tag_count_reward/std": 0.22000661492347717, "step": 2724 }, { @@ -79011,27 +79011,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1799.0, - "completions/mean_length": 826.548828125, - "completions/mean_terminated_length": 774.3075561523438, - "completions/min_length": 100.0, - "completions/min_terminated_length": 100.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1090.9296875, + "completions/mean_terminated_length": 1016.37890625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.9302722539899292, - "grad_norm": 4.375201225280762, - "kl": 8.8359375, - "learning_rate": 1.1335395664901071e-07, - "loss": 0.5388, - "num_tokens": 1449975883.0, - "reward": 1.8525390625, - "reward_std": 0.5686639547348022, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.2015654444694519, + "grad_norm": 3.957882881164551, + "kl": 3.3046875, + "learning_rate": 1.1336404009657603e-07, + "loss": 0.1403, + "num_tokens": 1567639552.0, + "reward": 1.17138671875, + "reward_std": 0.48888540267944336, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3310423493385315, + "rewards/tag_count_reward/mean": 0.88818359375, + "rewards/tag_count_reward/std": 0.22244861721992493, "step": 2725 }, { @@ -79040,27 +79040,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1989.0, - "completions/mean_length": 831.712890625, - "completions/mean_terminated_length": 787.394775390625, - "completions/min_length": 98.0, - "completions/min_terminated_length": 98.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1106.685546875, + "completions/mean_terminated_length": 1033.362060546875, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, "epoch": 0.93061363830332, - "grad_norm": 3.420452833175659, - "kl": 7.640625, - "learning_rate": 1.1322463226802109e-07, - "loss": 0.4433, - "num_tokens": 1450487784.0, - "reward": 1.8232421875, - "reward_std": 0.5823479890823364, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.8203125, - "rewards/format_reward/std": 0.38430243730545044, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.19893142580986023, + "grad_norm": 2.481177806854248, + "kl": 3.6015625, + "learning_rate": 1.1323461855076901e-07, + "loss": 0.2086, + "num_tokens": 1568292239.0, + "reward": 1.1142578125, + "reward_std": 0.5196710228919983, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.146484375, + "rewards/format_reward/std": 0.35393697023391724, + "rewards/tag_count_reward/mean": 0.8583984375, + "rewards/tag_count_reward/std": 0.2506698966026306, "step": 2726 }, { @@ -79069,27 +79069,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 820.052734375, - "completions/mean_terminated_length": 788.0621337890625, - "completions/min_length": 62.0, - "completions/min_terminated_length": 62.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1073.060546875, + "completions/mean_terminated_length": 1003.71337890625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.9309550226167108, - "grad_norm": 1.4049073457717896, - "kl": 6.609375, - "learning_rate": 1.1309592780977867e-07, - "loss": 0.4338, - "num_tokens": 1450988467.0, - "reward": 1.859375, - "reward_std": 0.5390852689743042, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.196245014667511, + "grad_norm": 2.6814558506011963, + "kl": 3.03515625, + "learning_rate": 1.1310581738396499e-07, + "loss": 0.1548, + "num_tokens": 1568922462.0, + "reward": 1.18408203125, + "reward_std": 0.5056514143943787, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.197265625, + "rewards/format_reward/std": 0.3983237147331238, + "rewards/tag_count_reward/mean": 0.89111328125, + "rewards/tag_count_reward/std": 0.21724599599838257, "step": 2727 }, { @@ -79098,27 +79098,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1943.0, - "completions/mean_length": 789.669921875, - "completions/mean_terminated_length": 735.8513793945312, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1047.892578125, + "completions/mean_terminated_length": 981.2188110351562, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, "epoch": 0.9312964069301015, - "grad_norm": 1.4815195798873901, - "kl": 7.2265625, - "learning_rate": 1.1296784345695585e-07, - "loss": 0.4875, - "num_tokens": 1451470842.0, - "reward": 1.84814453125, - "reward_std": 0.5590066313743591, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.18521249294281006, + "grad_norm": 2.15360689163208, + "kl": 3.21484375, + "learning_rate": 1.1297763677911238e-07, + "loss": 0.166, + "num_tokens": 1569537047.0, + "reward": 1.1748046875, + "reward_std": 0.517542839050293, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.197265625, + "rewards/format_reward/std": 0.3983237147331238, + "rewards/tag_count_reward/mean": 0.8935546875, + "rewards/tag_count_reward/std": 0.21648648381233215, "step": 2728 }, { @@ -79127,27 +79127,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 850.826171875, - "completions/mean_terminated_length": 812.2076416015625, - "completions/min_length": 100.0, - "completions/min_terminated_length": 100.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1109.849609375, + "completions/mean_terminated_length": 1053.521728515625, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, "epoch": 0.9316377912434923, - "grad_norm": 1.1708332300186157, - "kl": 6.171875, - "learning_rate": 1.1284037939134502e-07, - "loss": 0.3786, - "num_tokens": 1451986945.0, - "reward": 1.87060546875, - "reward_std": 0.5346391797065735, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.19823069870471954, + "grad_norm": 1.6677082777023315, + "kl": 2.0546875, + "learning_rate": 1.128500769182781e-07, + "loss": 0.0769, + "num_tokens": 1570185770.0, + "reward": 1.251953125, + "reward_std": 0.4941042959690094, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, + "rewards/format_reward/mean": 0.1875, + "rewards/format_reward/std": 0.39069411158561707, + "rewards/tag_count_reward/mean": 0.908203125, + "rewards/tag_count_reward/std": 0.19249899685382843, "step": 2729 }, { @@ -79156,27 +79156,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 796.201171875, - "completions/mean_terminated_length": 763.5891723632812, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1051.677734375, + "completions/mean_terminated_length": 991.857177734375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.9319791755568831, - "grad_norm": 0.9302889108657837, - "kl": 6.78125, - "learning_rate": 1.1271353579385804e-07, - "loss": 0.4129, - "num_tokens": 1452470968.0, - "reward": 1.8251953125, - "reward_std": 0.560287356376648, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.1988353431224823, + "grad_norm": 2.0316929817199707, + "kl": 2.61328125, + "learning_rate": 1.1272313798264753e-07, + "loss": 0.0963, + "num_tokens": 1570800597.0, + "reward": 1.1689453125, + "reward_std": 0.5213490724563599, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.189453125, + "rewards/format_reward/std": 0.3922513723373413, + "rewards/tag_count_reward/mean": 0.8876953125, + "rewards/tag_count_reward/std": 0.2284444123506546, "step": 2730 }, { @@ -79185,27 +79185,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 849.7421875, - "completions/mean_terminated_length": 816.0562133789062, - "completions/min_length": 40.0, - "completions/min_terminated_length": 40.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1155.177734375, + "completions/mean_terminated_length": 1097.63623046875, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, "epoch": 0.932320559870274, - "grad_norm": 1.5970714092254639, - "kl": 5.6484375, - "learning_rate": 1.1258731284452616e-07, - "loss": 0.3587, - "num_tokens": 1452981492.0, - "reward": 1.85205078125, - "reward_std": 0.49832069873809814, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.19379454851150513, + "grad_norm": 2.86479115486145, + "kl": 2.783203125, + "learning_rate": 1.1259682015252397e-07, + "loss": 0.1178, + "num_tokens": 1571467504.0, + "reward": 1.13916015625, + "reward_std": 0.5090773105621338, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.15234375, + "rewards/format_reward/std": 0.35970520973205566, + "rewards/tag_count_reward/mean": 0.88720703125, + "rewards/tag_count_reward/std": 0.2214023321866989, "step": 2731 }, { @@ -79214,27 +79214,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2043.0, - "completions/mean_length": 826.978515625, - "completions/mean_terminated_length": 779.9208984375, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1029.146484375, + "completions/mean_terminated_length": 981.2249145507812, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, "epoch": 0.9326619441836648, - "grad_norm": 2.426403760910034, - "kl": 6.5546875, - "learning_rate": 1.1246171072249991e-07, - "loss": 0.4547, - "num_tokens": 1453480857.0, - "reward": 1.90576171875, - "reward_std": 0.588792622089386, - "rewards/accuracy_reward/mean": 0.119140625, - "rewards/accuracy_reward/std": 0.32427072525024414, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.1958593726158142, + "grad_norm": 1.5832064151763916, + "kl": 2.611328125, + "learning_rate": 1.1247112360732859e-07, + "loss": 0.1164, + "num_tokens": 1572070379.0, + "reward": 1.18212890625, + "reward_std": 0.5231274366378784, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, + "rewards/format_reward/mean": 0.13671875, + "rewards/format_reward/std": 0.3438861668109894, + "rewards/tag_count_reward/mean": 0.90283203125, + "rewards/tag_count_reward/std": 0.21205542981624603, "step": 2732 }, { @@ -79243,27 +79243,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 863.044921875, - "completions/mean_terminated_length": 822.3495483398438, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1109.66015625, + "completions/mean_terminated_length": 1053.3209228515625, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, "epoch": 0.9330033284970556, - "grad_norm": 1.839331865310669, - "kl": 4.6171875, - "learning_rate": 1.1233672960604836e-07, - "loss": 0.298, - "num_tokens": 1453993808.0, - "reward": 1.9404296875, - "reward_std": 0.48437240719795227, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.896484375, - "rewards/format_reward/std": 0.30492907762527466, - "rewards/tag_count_reward/mean": 0.9482421875, - "rewards/tag_count_reward/std": 0.16480088233947754, + "grad_norm": 1.6788402795791626, + "kl": 2.033203125, + "learning_rate": 1.1234604852559989e-07, + "loss": 0.0908, + "num_tokens": 1572709597.0, + "reward": 1.19482421875, + "reward_std": 0.5316367745399475, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.169921875, + "rewards/format_reward/std": 0.3759314715862274, + "rewards/tag_count_reward/mean": 0.90380859375, + "rewards/tag_count_reward/std": 0.19634664058685303, "step": 2733 }, { @@ -79272,27 +79272,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 796.869140625, - "completions/mean_terminated_length": 735.3380737304688, - "completions/min_length": 88.0, - "completions/min_terminated_length": 88.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 1055.7265625, + "completions/mean_terminated_length": 982.918212890625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, "epoch": 0.9333447128104464, - "grad_norm": 1.7125744819641113, - "kl": 6.4375, - "learning_rate": 1.1221236967255949e-07, - "loss": 0.4165, - "num_tokens": 1454486061.0, - "reward": 1.8779296875, - "reward_std": 0.5492465496063232, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.21659238636493683, + "grad_norm": 2.4923200607299805, + "kl": 2.9921875, + "learning_rate": 1.122215950849939e-07, + "loss": 0.142, + "num_tokens": 1573334385.0, + "reward": 1.18798828125, + "reward_std": 0.48628371953964233, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.16015625, + "rewards/format_reward/std": 0.3671095669269562, + "rewards/tag_count_reward/mean": 0.88720703125, + "rewards/tag_count_reward/std": 0.21973879635334015, "step": 2734 }, { @@ -79301,27 +79301,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1883.0, - "completions/mean_length": 861.9921875, - "completions/mean_terminated_length": 798.543212890625, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1067.77734375, + "completions/mean_terminated_length": 1013.2083129882812, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.9336860971238372, - "grad_norm": 1.4921162128448486, - "kl": 6.5390625, - "learning_rate": 1.1208863109853938e-07, - "loss": 0.431, - "num_tokens": 1455001737.0, - "reward": 1.87255859375, - "reward_std": 0.5436522960662842, - "rewards/accuracy_reward/mean": 0.07258064299821854, - "rewards/accuracy_reward/std": 0.25970885157585144, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.19726912677288055, + "grad_norm": 1.6817021369934082, + "kl": 2.51171875, + "learning_rate": 1.1209776346228351e-07, + "loss": 0.0897, + "num_tokens": 1573955423.0, + "reward": 1.19482421875, + "reward_std": 0.5526976585388184, + "rewards/accuracy_reward/mean": 0.1088709682226181, + "rewards/accuracy_reward/std": 0.31179171800613403, + "rewards/format_reward/mean": 0.201171875, + "rewards/format_reward/std": 0.4012683033943176, + "rewards/tag_count_reward/mean": 0.88818359375, + "rewards/tag_count_reward/std": 0.2207929641008377, "step": 2735 }, { @@ -79330,27 +79330,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 845.359375, - "completions/mean_terminated_length": 804.0565795898438, - "completions/min_length": 134.0, - "completions/min_terminated_length": 134.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1046.78515625, + "completions/mean_terminated_length": 1001.8325805664062, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, "epoch": 0.9340274814372279, - "grad_norm": 1.247635006904602, - "kl": 6.078125, - "learning_rate": 1.1196551405961232e-07, - "loss": 0.3962, - "num_tokens": 1455508961.0, - "reward": 1.91943359375, - "reward_std": 0.5020928978919983, + "grad_norm": 2.0370399951934814, + "kl": 2.21484375, + "learning_rate": 1.1197455383335848e-07, + "loss": 0.1041, + "num_tokens": 1574565777.0, + "reward": 1.17138671875, + "reward_std": 0.49347296357154846, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.1798572540283203, + "rewards/format_reward/mean": 0.1640625, + "rewards/format_reward/std": 0.37069445848464966, + "rewards/tag_count_reward/mean": 0.90771484375, + "rewards/tag_count_reward/std": 0.20369400084018707, "step": 2736 }, { @@ -79359,27 +79359,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 838.4921875, - "completions/mean_terminated_length": 786.7617797851562, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1053.767578125, + "completions/mean_terminated_length": 998.4185791015625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, "epoch": 0.9343688657506187, - "grad_norm": 1.1580220460891724, - "kl": 7.0859375, - "learning_rate": 1.1184301873052052e-07, - "loss": 0.4467, - "num_tokens": 1456012653.0, - "reward": 1.85205078125, - "reward_std": 0.5236070156097412, - "rewards/accuracy_reward/mean": 0.06653226166963577, - "rewards/accuracy_reward/std": 0.2494617998600006, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.19674043357372284, + "grad_norm": 1.7936569452285767, + "kl": 2.51171875, + "learning_rate": 1.11851966373225e-07, + "loss": 0.1205, + "num_tokens": 1575179690.0, + "reward": 1.13525390625, + "reward_std": 0.49340930581092834, + "rewards/accuracy_reward/mean": 0.11088709533214569, + "rewards/accuracy_reward/std": 0.3143092691898346, + "rewards/format_reward/mean": 0.134765625, + "rewards/format_reward/std": 0.3418070077896118, + "rewards/tag_count_reward/mean": 0.89306640625, + "rewards/tag_count_reward/std": 0.20963992178440094, "step": 2737 }, { @@ -79388,27 +79388,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1988.0, - "completions/mean_length": 830.982421875, - "completions/mean_terminated_length": 791.7237548828125, - "completions/min_length": 95.0, - "completions/min_terminated_length": 95.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1107.134765625, + "completions/mean_terminated_length": 1029.55810546875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, "epoch": 0.9347102500640095, - "grad_norm": 1.6010982990264893, - "kl": 6.40625, - "learning_rate": 1.1172114528512358e-07, - "loss": 0.3609, - "num_tokens": 1456520548.0, - "reward": 1.85693359375, - "reward_std": 0.5425029397010803, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.2022298127412796, + "grad_norm": 3.862917900085449, + "kl": 2.65625, + "learning_rate": 1.1173000125600561e-07, + "loss": 0.1732, + "num_tokens": 1575828975.0, + "reward": 1.17724609375, + "reward_std": 0.5306399464607239, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.1640625, + "rewards/format_reward/std": 0.37069445848464966, + "rewards/tag_count_reward/mean": 0.88427734375, + "rewards/tag_count_reward/std": 0.2328498363494873, "step": 2738 }, { @@ -79417,27 +79417,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1922.0, - "completions/mean_length": 824.310546875, - "completions/mean_terminated_length": 774.5670166015625, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1073.4140625, + "completions/mean_terminated_length": 1010.6029052734375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, "epoch": 0.9350516343774004, - "grad_norm": 1.4851871728897095, - "kl": 8.1484375, - "learning_rate": 1.115998938963986e-07, - "loss": 0.4943, - "num_tokens": 1457016883.0, - "reward": 1.83740234375, - "reward_std": 0.5193938612937927, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.1916077733039856, + "grad_norm": 2.2169547080993652, + "kl": 2.798828125, + "learning_rate": 1.1160865865493885e-07, + "loss": 0.1225, + "num_tokens": 1576452851.0, + "reward": 1.13037109375, + "reward_std": 0.5240286588668823, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.189453125, + "rewards/format_reward/std": 0.3922513723373413, + "rewards/tag_count_reward/mean": 0.87646484375, + "rewards/tag_count_reward/std": 0.22932685911655426, "step": 2739 }, { @@ -79446,27 +79446,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 784.787109375, - "completions/mean_terminated_length": 757.0518798828125, - "completions/min_length": 90.0, - "completions/min_terminated_length": 90.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1095.41015625, + "completions/mean_terminated_length": 1010.2850952148438, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.9353930186907912, - "grad_norm": 1.6466114521026611, - "kl": 6.6484375, - "learning_rate": 1.1147926473643973e-07, - "loss": 0.4206, - "num_tokens": 1457500262.0, - "reward": 1.865234375, - "reward_std": 0.4733615517616272, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.20127369463443756, + "grad_norm": 2.004274606704712, + "kl": 2.70703125, + "learning_rate": 1.1148793874237905e-07, + "loss": 0.1281, + "num_tokens": 1577095269.0, + "reward": 1.14111328125, + "reward_std": 0.5007473230361938, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.16796875, + "rewards/format_reward/std": 0.374204158782959, + "rewards/tag_count_reward/mean": 0.88134765625, + "rewards/tag_count_reward/std": 0.22977641224861145, "step": 2740 }, { @@ -79475,27 +79475,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 805.1640625, - "completions/mean_terminated_length": 772.7855834960938, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, - "epoch": 0.935734403004182, - "grad_norm": 1.5594877004623413, - "kl": 5.7890625, - "learning_rate": 1.1135925797645812e-07, - "loss": 0.3914, - "num_tokens": 1457985050.0, - "reward": 1.91943359375, - "reward_std": 0.5162756443023682, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.94091796875, - "rewards/tag_count_reward/std": 0.1750049889087677, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1053.6796875, + "completions/mean_terminated_length": 1000.485595703125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.935734403004182, + "grad_norm": 3.298950433731079, + "kl": 3.109375, + "learning_rate": 1.1136784168979604e-07, + "loss": 0.1366, + "num_tokens": 1577707297.0, + "reward": 1.212890625, + "reward_std": 0.536832869052887, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.1953125, + "rewards/format_reward/std": 0.3968288004398346, + "rewards/tag_count_reward/mean": 0.888671875, + "rewards/tag_count_reward/std": 0.22020848095417023, "step": 2741 }, { @@ -79504,27 +79504,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 797.59765625, - "completions/mean_terminated_length": 759.859130859375, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1070.462890625, + "completions/mean_terminated_length": 987.6207885742188, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.9360757873175728, - "grad_norm": 1.6687159538269043, - "kl": 5.03515625, - "learning_rate": 1.1123987378678127e-07, - "loss": 0.3125, - "num_tokens": 1458470924.0, - "reward": 1.90478515625, - "reward_std": 0.4768308401107788, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.896484375, - "rewards/format_reward/std": 0.30492907762527466, - "rewards/tag_count_reward/mean": 0.94384765625, - "rewards/tag_count_reward/std": 0.17527233064174652, + "grad_norm": 2.6163384914398193, + "kl": 2.541015625, + "learning_rate": 1.1124836766777502e-07, + "loss": 0.1166, + "num_tokens": 1578332878.0, + "reward": 1.18310546875, + "reward_std": 0.5336445569992065, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.1875, + "rewards/format_reward/std": 0.39069411158561707, + "rewards/tag_count_reward/mean": 0.87841796875, + "rewards/tag_count_reward/std": 0.22930601239204407, "step": 2742 }, { @@ -79533,27 +79533,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1958.0, - "completions/mean_length": 773.404296875, - "completions/mean_terminated_length": 755.7366333007812, - "completions/min_length": 89.0, - "completions/min_terminated_length": 89.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1067.251953125, + "completions/mean_terminated_length": 1008.366455078125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, "epoch": 0.9364171716309636, - "grad_norm": 1.5119606256484985, - "kl": 4.30078125, - "learning_rate": 1.1112111233685323e-07, - "loss": 0.2805, - "num_tokens": 1458948491.0, - "reward": 1.953125, - "reward_std": 0.43765851855278015, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.919921875, - "rewards/format_reward/std": 0.271679550409317, - "rewards/tag_count_reward/mean": 0.958984375, - "rewards/tag_count_reward/std": 0.15253034234046936, + "grad_norm": 2.0988738536834717, + "kl": 3.109375, + "learning_rate": 1.1112951684601616e-07, + "loss": 0.1381, + "num_tokens": 1578960895.0, + "reward": 1.27490234375, + "reward_std": 0.6136878728866577, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.244140625, + "rewards/format_reward/std": 0.42999663949012756, + "rewards/tag_count_reward/mean": 0.89208984375, + "rewards/tag_count_reward/std": 0.22546088695526123, "step": 2743 }, { @@ -79562,27 +79562,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 845.7890625, - "completions/mean_terminated_length": 804.5010375976562, - "completions/min_length": 125.0, - "completions/min_terminated_length": 125.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1039.001953125, + "completions/mean_terminated_length": 980.630126953125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, "epoch": 0.9367585559443543, - "grad_norm": 1.2072147130966187, - "kl": 5.609375, - "learning_rate": 1.1100297379523423e-07, - "loss": 0.3519, - "num_tokens": 1459464143.0, - "reward": 1.951171875, - "reward_std": 0.4601624608039856, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.896484375, - "rewards/format_reward/std": 0.30492907762527466, - "rewards/tag_count_reward/mean": 0.951171875, - "rewards/tag_count_reward/std": 0.15814046561717987, + "grad_norm": 2.491743564605713, + "kl": 2.580078125, + "learning_rate": 1.1101128939333448e-07, + "loss": 0.1241, + "num_tokens": 1579575472.0, + "reward": 1.25390625, + "reward_std": 0.5565868616104126, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, + "rewards/format_reward/mean": 0.1875, + "rewards/format_reward/std": 0.39069411158561707, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.2085207849740982, "step": 2744 }, { @@ -79591,27 +79591,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1868.0, - "completions/mean_length": 787.537109375, - "completions/mean_terminated_length": 752.1023559570312, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1001.1796875, + "completions/mean_terminated_length": 967.4112548828125, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, "epoch": 0.9370999402577451, - "grad_norm": 1.7770161628723145, - "kl": 6.6796875, - "learning_rate": 1.108854583296002e-07, - "loss": 0.4191, - "num_tokens": 1459940114.0, - "reward": 1.88330078125, - "reward_std": 0.5040788650512695, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.18620555102825165, + "grad_norm": 4.4087700843811035, + "kl": 2.06640625, + "learning_rate": 1.1089368547765956e-07, + "loss": 0.0792, + "num_tokens": 1580160828.0, + "reward": 1.203125, + "reward_std": 0.5256979465484619, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.177734375, + "rewards/format_reward/std": 0.3826628625392914, + "rewards/tag_count_reward/mean": 0.912109375, + "rewards/tag_count_reward/std": 0.2017289400100708, "step": 2745 }, { @@ -79620,27 +79620,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 814.953125, - "completions/mean_terminated_length": 767.4320068359375, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1098.75390625, + "completions/mean_terminated_length": 993.7396850585938, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, "epoch": 0.9374413245711359, - "grad_norm": 1.4414271116256714, - "kl": 6.1171875, - "learning_rate": 1.1076856610674298e-07, - "loss": 0.3417, - "num_tokens": 1460450394.0, - "reward": 1.826171875, - "reward_std": 0.5320627689361572, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.931640625, - "rewards/tag_count_reward/std": 0.186283141374588, + "grad_norm": 2.1992671489715576, + "kl": 3.25390625, + "learning_rate": 1.1077670526603537e-07, + "loss": 0.1646, + "num_tokens": 1580816414.0, + "reward": 1.11669921875, + "reward_std": 0.5224183201789856, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.189453125, + "rewards/format_reward/std": 0.3922513723373413, + "rewards/tag_count_reward/mean": 0.87060546875, + "rewards/tag_count_reward/std": 0.23404096066951752, "step": 2746 }, { @@ -79649,27 +79649,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 867.416015625, - "completions/mean_terminated_length": 829.3326416015625, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1102.56640625, + "completions/mean_terminated_length": 1033.1949462890625, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.9377827088845268, - "grad_norm": 1.8329652547836304, - "kl": 6.3359375, - "learning_rate": 1.106522972925696e-07, - "loss": 0.366, - "num_tokens": 1460977279.0, - "reward": 1.865234375, - "reward_std": 0.47398167848587036, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.18036192655563354, + "grad_norm": 2.437163829803467, + "kl": 2.75390625, + "learning_rate": 1.1066034892461983e-07, + "loss": 0.1322, + "num_tokens": 1581463696.0, + "reward": 1.146484375, + "reward_std": 0.5082423686981201, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.203125, + "rewards/format_reward/std": 0.4027182459831238, + "rewards/tag_count_reward/mean": 0.884765625, + "rewards/tag_count_reward/std": 0.22317391633987427, "step": 2747 }, { @@ -79680,25 +79680,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1990.0, - "completions/mean_length": 873.8671875, - "completions/mean_terminated_length": 828.6166381835938, - "completions/min_length": 216.0, - "completions/min_terminated_length": 216.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1058.5859375, + "completions/mean_terminated_length": 1020.454345703125, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, "epoch": 0.9381240931979176, - "grad_norm": 1.8618098497390747, - "kl": 7.3359375, - "learning_rate": 1.1053665205210249e-07, - "loss": 0.4809, - "num_tokens": 1461494891.0, - "reward": 1.82421875, - "reward_std": 0.5058585405349731, - "rewards/accuracy_reward/mean": 0.01953125, - "rewards/accuracy_reward/std": 0.1385180652141571, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.931640625, - "rewards/tag_count_reward/std": 0.19210147857666016, + "grad_norm": 4.850437641143799, + "kl": 2.16015625, + "learning_rate": 1.105446166186849e-07, + "loss": 0.0444, + "num_tokens": 1582075884.0, + "reward": 1.18408203125, + "reward_std": 0.5133605003356934, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.220703125, + "rewards/format_reward/std": 0.4151262938976288, + "rewards/tag_count_reward/mean": 0.90869140625, + "rewards/tag_count_reward/std": 0.20111636817455292, "step": 2748 }, { @@ -79707,27 +79707,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 793.55078125, - "completions/mean_terminated_length": 768.561767578125, - "completions/min_length": 106.0, - "completions/min_terminated_length": 106.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1021.609375, + "completions/mean_terminated_length": 973.3333129882812, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, "epoch": 0.9384654775113084, - "grad_norm": 1.2260710000991821, - "kl": 5.04296875, - "learning_rate": 1.1042163054947881e-07, - "loss": 0.3005, - "num_tokens": 1461975909.0, - "reward": 1.92529296875, - "reward_std": 0.45410025119781494, - "rewards/accuracy_reward/mean": 0.08541666716337204, - "rewards/accuracy_reward/std": 0.27979233860969543, - "rewards/format_reward/mean": 0.89453125, - "rewards/format_reward/std": 0.3074568510055542, - "rewards/tag_count_reward/mean": 0.95068359375, - "rewards/tag_count_reward/std": 0.15837518870830536, + "grad_norm": 3.285113573074341, + "kl": 3.16796875, + "learning_rate": 1.1042950851261594e-07, + "loss": 0.1494, + "num_tokens": 1582673668.0, + "reward": 1.16943359375, + "reward_std": 0.527377724647522, + "rewards/accuracy_reward/mean": 0.12083332985639572, + "rewards/accuracy_reward/std": 0.32627353072166443, + "rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.3776407241821289, + "rewards/tag_count_reward/mean": 0.88427734375, + "rewards/tag_count_reward/std": 0.21764588356018066, "step": 2749 }, { @@ -79736,27 +79736,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.095703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1871.0, - "completions/mean_length": 875.1796875, - "completions/mean_terminated_length": 827.5040283203125, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1115.119140625, + "completions/mean_terminated_length": 1016.3909301757812, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, "epoch": 0.9388068618246992, - "grad_norm": 1.4115797281265259, - "kl": 5.65625, - "learning_rate": 1.1030723294795055e-07, - "loss": 0.3572, - "num_tokens": 1462501089.0, - "reward": 1.83935546875, - "reward_std": 0.4400091767311096, - "rewards/accuracy_reward/mean": 0.02734375, - "rewards/accuracy_reward/std": 0.16324250400066376, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.93505859375, - "rewards/tag_count_reward/std": 0.1865234375, + "grad_norm": 3.8029723167419434, + "kl": 2.419921875, + "learning_rate": 1.1031502476991205e-07, + "loss": 0.11, + "num_tokens": 1583321697.0, + "reward": 1.17822265625, + "reward_std": 0.5240421295166016, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.220703125, + "rewards/format_reward/std": 0.4151262938976288, + "rewards/tag_count_reward/mean": 0.87158203125, + "rewards/tag_count_reward/std": 0.2271624505519867, "step": 2750 }, { @@ -79765,27 +79765,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 836.431640625, - "completions/mean_terminated_length": 779.44580078125, - "completions/min_length": 60.0, - "completions/min_terminated_length": 60.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1056.69140625, + "completions/mean_terminated_length": 979.4736328125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.93914824613809, - "grad_norm": 2.2590596675872803, - "kl": 7.5234375, - "learning_rate": 1.1019345940988427e-07, - "loss": 0.4358, - "num_tokens": 1463009102.0, - "reward": 1.82275390625, - "reward_std": 0.4858624339103699, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.19631743431091309, + "grad_norm": 3.1952877044677734, + "kl": 3.875, + "learning_rate": 1.1020116555318504e-07, + "loss": 0.2257, + "num_tokens": 1583942483.0, + "reward": 1.1064453125, + "reward_std": 0.49217334389686584, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.158203125, + "rewards/format_reward/std": 0.36528825759887695, + "rewards/tag_count_reward/mean": 0.8583984375, + "rewards/tag_count_reward/std": 0.2432408630847931, "step": 2751 }, { @@ -79794,27 +79794,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 906.3359375, - "completions/mean_terminated_length": 862.336669921875, - "completions/min_length": 138.0, - "completions/min_terminated_length": 138.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1151.240234375, + "completions/mean_terminated_length": 1079.3480224609375, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, "epoch": 0.9394896304514807, - "grad_norm": 1.9747235774993896, - "kl": 6.8671875, - "learning_rate": 1.1008031009676061e-07, - "loss": 0.4171, - "num_tokens": 1463562954.0, - "reward": 1.8486328125, - "reward_std": 0.5552176833152771, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.19495287537574768, + "grad_norm": 1.4473960399627686, + "kl": 2.91796875, + "learning_rate": 1.1008793102416005e-07, + "loss": 0.1289, + "num_tokens": 1584621726.0, + "reward": 1.2216796875, + "reward_std": 0.5785197615623474, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.25390625, + "rewards/format_reward/std": 0.43567025661468506, + "rewards/tag_count_reward/mean": 0.8818359375, + "rewards/tag_count_reward/std": 0.22654588520526886, "step": 2752 }, { @@ -79823,27 +79823,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 760.125, - "completions/mean_terminated_length": 734.4701538085938, - "completions/min_length": 58.0, - "completions/min_terminated_length": 58.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 975.556640625, + "completions/mean_terminated_length": 936.4797973632812, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.9398310147648715, - "grad_norm": 2.940276861190796, - "kl": 4.484375, - "learning_rate": 1.0996778516917438e-07, - "loss": 0.3332, - "num_tokens": 1464033370.0, - "reward": 1.95849609375, - "reward_std": 0.47670936584472656, - "rewards/accuracy_reward/mean": 0.107421875, - "rewards/accuracy_reward/std": 0.30995169281959534, - "rewards/format_reward/mean": 0.90234375, - "rewards/format_reward/std": 0.29713961482048035, - "rewards/tag_count_reward/mean": 0.94873046875, - "rewards/tag_count_reward/std": 0.16308951377868652, + "grad_norm": 4.085820198059082, + "kl": 2.693359375, + "learning_rate": 1.0997532134367466e-07, + "loss": 0.1461, + "num_tokens": 1585202443.0, + "reward": 1.29345703125, + "reward_std": 0.54472815990448, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, + "rewards/format_reward/mean": 0.23046875, + "rewards/format_reward/std": 0.42154473066329956, + "rewards/tag_count_reward/mean": 0.90478515625, + "rewards/tag_count_reward/std": 0.21121110022068024, "step": 2753 }, { @@ -79852,27 +79852,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 796.9375, - "completions/mean_terminated_length": 764.3447265625, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 980.50390625, + "completions/mean_terminated_length": 946.0685424804688, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, "epoch": 0.9401723990782623, - "grad_norm": 1.0292353630065918, - "kl": 5.05859375, - "learning_rate": 1.0985588478683407e-07, - "loss": 0.3041, - "num_tokens": 1464517642.0, - "reward": 1.884765625, - "reward_std": 0.5098904371261597, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.17693878710269928, + "grad_norm": 3.333768129348755, + "kl": 2.78515625, + "learning_rate": 1.0986333667167905e-07, + "loss": 0.0896, + "num_tokens": 1585780701.0, + "reward": 1.2021484375, + "reward_std": 0.5328149199485779, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.203125, + "rewards/format_reward/std": 0.4027182459831238, + "rewards/tag_count_reward/mean": 0.8955078125, + "rewards/tag_count_reward/std": 0.21346396207809448, "step": 2754 }, { @@ -79881,27 +79881,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1859.0, - "completions/mean_length": 822.7265625, - "completions/mean_terminated_length": 770.3218383789062, - "completions/min_length": 114.0, - "completions/min_terminated_length": 114.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1016.177734375, + "completions/mean_terminated_length": 967.6461791992188, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, "epoch": 0.9405137833916531, - "grad_norm": 1.6634807586669922, - "kl": 5.89453125, - "learning_rate": 1.0974460910856182e-07, - "loss": 0.394, - "num_tokens": 1465017150.0, - "reward": 1.86083984375, - "reward_std": 0.5335161089897156, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.18773873150348663, + "grad_norm": 3.897737503051758, + "kl": 2.857421875, + "learning_rate": 1.0975197716723546e-07, + "loss": 0.1074, + "num_tokens": 1586379256.0, + "reward": 1.23828125, + "reward_std": 0.5745152831077576, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.22265625, + "rewards/format_reward/std": 0.41643625497817993, + "rewards/tag_count_reward/mean": 0.892578125, + "rewards/tag_count_reward/std": 0.21026401221752167, "step": 2755 }, { @@ -79910,27 +79910,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, - "completions/mean_length": 865.8359375, - "completions/mean_terminated_length": 810.2330932617188, - "completions/min_length": 88.0, - "completions/min_terminated_length": 88.0, + "completions/mean_length": 1052.517578125, + "completions/mean_terminated_length": 1012.05078125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.940855167705044, - "grad_norm": 1.9925856590270996, - "kl": 6.6953125, - "learning_rate": 1.0963395829229322e-07, - "loss": 0.3993, - "num_tokens": 1465539626.0, - "reward": 1.892578125, - "reward_std": 0.5888175964355469, - "rewards/accuracy_reward/mean": 0.1484375, - "rewards/accuracy_reward/std": 0.35588082671165466, - "rewards/format_reward/mean": 0.830078125, - "rewards/format_reward/std": 0.3759314715862274, - "rewards/tag_count_reward/mean": 0.9140625, - "rewards/tag_count_reward/std": 0.19891461730003357, + "grad_norm": 3.8902318477630615, + "kl": 2.205078125, + "learning_rate": 1.0964124298851851e-07, + "loss": 0.1018, + "num_tokens": 1586997313.0, + "reward": 1.34619140625, + "reward_std": 0.5808758735656738, + "rewards/accuracy_reward/mean": 0.197265625, + "rewards/accuracy_reward/std": 0.3983237147331238, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.43343618512153625, + "rewards/tag_count_reward/mean": 0.89892578125, + "rewards/tag_count_reward/std": 0.20905111730098724, "step": 2756 }, { @@ -79939,27 +79939,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1988.0, - "completions/mean_length": 822.06640625, - "completions/mean_terminated_length": 787.6023559570312, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1041.39453125, + "completions/mean_terminated_length": 983.1611328125, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, "epoch": 0.9411965520184348, - "grad_norm": 1.9566264152526855, - "kl": 4.609375, - "learning_rate": 1.0952393249507669e-07, - "loss": 0.2954, - "num_tokens": 1466041836.0, - "reward": 1.904296875, - "reward_std": 0.5156567096710205, - "rewards/accuracy_reward/mean": 0.1171875, - "rewards/accuracy_reward/std": 0.32195815443992615, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.1834714710712433, + "grad_norm": 2.0232720375061035, + "kl": 3.09375, + "learning_rate": 1.0953113429281422e-07, + "loss": 0.1605, + "num_tokens": 1587611819.0, + "reward": 1.25, + "reward_std": 0.5445467233657837, + "rewards/accuracy_reward/mean": 0.162109375, + "rewards/accuracy_reward/std": 0.3689115643501282, + "rewards/format_reward/mean": 0.19140625, + "rewards/format_reward/std": 0.3937928080558777, + "rewards/tag_count_reward/mean": 0.896484375, + "rewards/tag_count_reward/std": 0.2057807892560959, "step": 2757 }, { @@ -79968,27 +79968,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1950.0, - "completions/mean_length": 872.8046875, - "completions/mean_terminated_length": 804.8181762695312, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1053.44921875, + "completions/mean_terminated_length": 980.4737548828125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.9415379363318256, - "grad_norm": 2.669552803039551, - "kl": 6.8671875, - "learning_rate": 1.0941453187307386e-07, - "loss": 0.4291, - "num_tokens": 1466568136.0, - "reward": 1.779296875, - "reward_std": 0.5448940992355347, - "rewards/accuracy_reward/mean": 0.04032257944345474, - "rewards/accuracy_reward/std": 0.19691328704357147, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.912109375, - "rewards/tag_count_reward/std": 0.20592933893203735, + "grad_norm": 2.105117082595825, + "kl": 3.0703125, + "learning_rate": 1.0942165123652037e-07, + "loss": 0.1172, + "num_tokens": 1588230609.0, + "reward": 1.1865234375, + "reward_std": 0.5329842567443848, + "rewards/accuracy_reward/mean": 0.07258064299821854, + "rewards/accuracy_reward/std": 0.25970885157585144, + "rewards/format_reward/mean": 0.23046875, + "rewards/format_reward/std": 0.42154473066329956, + "rewards/tag_count_reward/mean": 0.8857421875, + "rewards/tag_count_reward/std": 0.22476740181446075, "step": 2758 }, { @@ -79999,25 +79999,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 886.259765625, - "completions/mean_terminated_length": 816.5072631835938, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1088.666015625, + "completions/mean_terminated_length": 1031.0662841796875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, "epoch": 0.9418793206452164, - "grad_norm": 4.988946914672852, - "kl": 8.734375, - "learning_rate": 1.0930575658155882e-07, - "loss": 0.5076, - "num_tokens": 1467114845.0, - "reward": 1.67578125, - "reward_std": 0.6371972560882568, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.76171875, - "rewards/format_reward/std": 0.42644867300987244, - "rewards/tag_count_reward/mean": 0.87109375, - "rewards/tag_count_reward/std": 0.2381935715675354, + "grad_norm": 6.0164408683776855, + "kl": 2.8671875, + "learning_rate": 1.0931279397514603e-07, + "loss": 0.0856, + "num_tokens": 1588880950.0, + "reward": 1.1689453125, + "reward_std": 0.5427096486091614, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.212890625, + "rewards/format_reward/std": 0.409751296043396, + "rewards/tag_count_reward/mean": 0.8818359375, + "rewards/tag_count_reward/std": 0.2137681394815445, "step": 2759 }, { @@ -80026,27 +80026,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 797.75390625, - "completions/mean_terminated_length": 749.5699462890625, - "completions/min_length": 9.0, - "completions/min_terminated_length": 9.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1085.111328125, + "completions/mean_terminated_length": 1029.406982421875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.9422207049586071, - "grad_norm": 1.230309247970581, - "kl": 6.7265625, - "learning_rate": 1.0919760677491827e-07, - "loss": 0.4447, - "num_tokens": 1467599647.0, - "reward": 1.82666015625, - "reward_std": 0.5500995516777039, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.20284785330295563, + "grad_norm": 4.037320613861084, + "kl": 2.6484375, + "learning_rate": 1.0920456266331154e-07, + "loss": 0.0629, + "num_tokens": 1589512879.0, + "reward": 1.26806640625, + "reward_std": 0.6067713499069214, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.2578125, + "rewards/format_reward/std": 0.43785804510116577, + "rewards/tag_count_reward/mean": 0.89501953125, + "rewards/tag_count_reward/std": 0.21972140669822693, "step": 2760 }, { @@ -80055,27 +80055,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1950.0, - "completions/mean_length": 878.380859375, - "completions/mean_terminated_length": 810.7169189453125, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 1106.484375, + "completions/mean_terminated_length": 1056.115234375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, "epoch": 0.9425620892719979, - "grad_norm": 2.21763277053833, - "kl": 7.4765625, - "learning_rate": 1.0909008260665102e-07, - "loss": 0.4783, - "num_tokens": 1468138690.0, - "reward": 1.76171875, - "reward_std": 0.5877810120582581, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.82421875, - "rewards/format_reward/std": 0.3810062110424042, - "rewards/tag_count_reward/mean": 0.904296875, - "rewards/tag_count_reward/std": 0.2236529439687729, + "grad_norm": 3.344714403152466, + "kl": 3.28515625, + "learning_rate": 1.0909695745474783e-07, + "loss": 0.1226, + "num_tokens": 1590168711.0, + "reward": 1.1669921875, + "reward_std": 0.5766034126281738, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.25390625, + "rewards/format_reward/std": 0.43567025661468506, + "rewards/tag_count_reward/mean": 0.8583984375, + "rewards/tag_count_reward/std": 0.2427375167608261, "step": 2761 }, { @@ -80084,27 +80084,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1988.0, - "completions/mean_length": 834.28125, - "completions/mean_terminated_length": 782.3707275390625, - "completions/min_length": 79.0, - "completions/min_terminated_length": 79.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1011.427734375, + "completions/mean_terminated_length": 971.4786987304688, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, "epoch": 0.9429034735853887, - "grad_norm": 2.670936107635498, - "kl": 6.578125, - "learning_rate": 1.0898318422936796e-07, - "loss": 0.4466, - "num_tokens": 1468640258.0, - "reward": 1.84130859375, - "reward_std": 0.5695170164108276, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.91748046875, - "rewards/tag_count_reward/std": 0.2012730985879898, + "grad_norm": 1.6095952987670898, + "kl": 2.8359375, + "learning_rate": 1.0898997850229693e-07, + "loss": 0.1003, + "num_tokens": 1590760978.0, + "reward": 1.25634765625, + "reward_std": 0.567395031452179, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.234375, + "rewards/format_reward/std": 0.42402184009552, + "rewards/tag_count_reward/mean": 0.89501953125, + "rewards/tag_count_reward/std": 0.21804504096508026, "step": 2762 }, { @@ -80113,27 +80113,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 867.34765625, - "completions/mean_terminated_length": 816.8513793945312, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1061.474609375, + "completions/mean_terminated_length": 1021.3718872070312, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, "epoch": 0.9432448578987795, - "grad_norm": 1.1016266345977783, - "kl": 5.6484375, - "learning_rate": 1.0887691179479182e-07, - "loss": 0.3679, - "num_tokens": 1469162836.0, - "reward": 1.85888671875, - "reward_std": 0.5383203625679016, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.19823069870471954, + "grad_norm": 3.225815773010254, + "kl": 2.94921875, + "learning_rate": 1.0888362595791095e-07, + "loss": 0.0884, + "num_tokens": 1591382949.0, + "reward": 1.2587890625, + "reward_std": 0.5945134162902832, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.275390625, + "rewards/format_reward/std": 0.44714778661727905, + "rewards/tag_count_reward/mean": 0.9033203125, + "rewards/tag_count_reward/std": 0.2108335644006729, "step": 2763 }, { @@ -80142,27 +80142,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 818.064453125, - "completions/mean_terminated_length": 775.8242797851562, - "completions/min_length": 44.0, - "completions/min_terminated_length": 44.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 998.7890625, + "completions/mean_terminated_length": 944.9281616210938, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, "epoch": 0.9435862422121704, - "grad_norm": 4.187354564666748, - "kl": 4.8046875, - "learning_rate": 1.0877126545375688e-07, - "loss": 0.3482, - "num_tokens": 1469657781.0, - "reward": 1.91357421875, - "reward_std": 0.5429072976112366, - "rewards/accuracy_reward/mean": 0.119140625, - "rewards/accuracy_reward/std": 0.32427072525024414, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.18766237795352936, + "grad_norm": 3.073822498321533, + "kl": 2.8203125, + "learning_rate": 1.0877789997265255e-07, + "loss": 0.0874, + "num_tokens": 1591970425.0, + "reward": 1.23876953125, + "reward_std": 0.5180349349975586, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.21875, + "rewards/format_reward/std": 0.41380295157432556, + "rewards/tag_count_reward/mean": 0.90283203125, + "rewards/tag_count_reward/std": 0.20140598714351654, "step": 2764 }, { @@ -80171,27 +80171,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1930.0, - "completions/mean_length": 818.171875, - "completions/mean_terminated_length": 762.955078125, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1043.767578125, + "completions/mean_terminated_length": 974.5824584960938, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, "epoch": 0.9439276265255612, - "grad_norm": 2.6798832416534424, - "kl": 5.875, - "learning_rate": 1.0866624535620878e-07, - "loss": 0.4086, - "num_tokens": 1470150269.0, - "reward": 1.81982421875, - "reward_std": 0.5035111904144287, - "rewards/accuracy_reward/mean": 0.03515625, - "rewards/accuracy_reward/std": 0.1843547374010086, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.2038721740245819, + "grad_norm": 3.5396907329559326, + "kl": 3.5703125, + "learning_rate": 1.0867280069669415e-07, + "loss": 0.1635, + "num_tokens": 1592578418.0, + "reward": 1.1357421875, + "reward_std": 0.5693184733390808, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.23046875, + "rewards/format_reward/std": 0.42154473066329956, + "rewards/tag_count_reward/mean": 0.8525390625, + "rewards/tag_count_reward/std": 0.24972273409366608, "step": 2765 }, { @@ -80200,27 +80200,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2024.0, - "completions/mean_length": 853.833984375, - "completions/mean_terminated_length": 792.5318603515625, - "completions/min_length": 104.0, - "completions/min_terminated_length": 104.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1045.017578125, + "completions/mean_terminated_length": 989.1814575195312, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, "epoch": 0.944269010838952, - "grad_norm": 2.4273881912231445, - "kl": 6.5546875, - "learning_rate": 1.0856185165120433e-07, - "loss": 0.4685, - "num_tokens": 1470658808.0, - "reward": 1.88623046875, - "reward_std": 0.5840543508529663, - "rewards/accuracy_reward/mean": 0.10546875, - "rewards/accuracy_reward/std": 0.3074568510055542, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.92138671875, - "rewards/tag_count_reward/std": 0.20343582332134247, + "grad_norm": 2.6293399333953857, + "kl": 3.0390625, + "learning_rate": 1.0856832827931831e-07, + "loss": 0.1342, + "num_tokens": 1593184843.0, + "reward": 1.2998046875, + "reward_std": 0.6106535196304321, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.263671875, + "rewards/format_reward/std": 0.4410543739795685, + "rewards/tag_count_reward/mean": 0.8896484375, + "rewards/tag_count_reward/std": 0.22290615737438202, "step": 2766 }, { @@ -80229,27 +80229,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1927.0, - "completions/mean_length": 862.708984375, - "completions/mean_terminated_length": 822.0020751953125, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1011.826171875, + "completions/mean_terminated_length": 982.69677734375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, "epoch": 0.9446103951523428, - "grad_norm": 1.5315406322479248, - "kl": 6.375, - "learning_rate": 1.0845808448691141e-07, - "loss": 0.4185, - "num_tokens": 1471171075.0, - "reward": 1.84423828125, - "reward_std": 0.5585125684738159, - "rewards/accuracy_reward/mean": 0.06451612710952759, - "rewards/accuracy_reward/std": 0.2459181249141693, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.2030550092458725, + "grad_norm": 2.308115005493164, + "kl": 2.875, + "learning_rate": 1.0846448286891689e-07, + "loss": 0.1141, + "num_tokens": 1593773458.0, + "reward": 1.240234375, + "reward_std": 0.5799919366836548, + "rewards/accuracy_reward/mean": 0.13306452333927155, + "rewards/accuracy_reward/std": 0.3399873673915863, + "rewards/format_reward/mean": 0.22265625, + "rewards/format_reward/std": 0.41643625497817993, + "rewards/tag_count_reward/mean": 0.888671875, + "rewards/tag_count_reward/std": 0.2151515930891037, "step": 2767 }, { @@ -80258,27 +80258,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.052734375, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 822.896484375, - "completions/mean_terminated_length": 754.6948852539062, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1041.376953125, + "completions/mean_terminated_length": 965.245849609375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.9449517794657335, - "grad_norm": 2.03684401512146, - "kl": 6.6171875, - "learning_rate": 1.0835494401060835e-07, - "loss": 0.4486, - "num_tokens": 1471670398.0, - "reward": 1.8759765625, - "reward_std": 0.5379860401153564, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.1911715567111969, + "grad_norm": 1.818924069404602, + "kl": 3.015625, + "learning_rate": 1.0836126461299149e-07, + "loss": 0.1301, + "num_tokens": 1594384643.0, + "reward": 1.26171875, + "reward_std": 0.5729702115058899, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.251953125, + "rewards/format_reward/std": 0.43455907702445984, + "rewards/tag_count_reward/mean": 0.880859375, + "rewards/tag_count_reward/std": 0.22494801878929138, "step": 2768 }, { @@ -80287,27 +80287,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 844.67578125, - "completions/mean_terminated_length": 818.2554931640625, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1101.626953125, + "completions/mean_terminated_length": 1025.75732421875, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, "epoch": 0.9452931637791243, - "grad_norm": 1.6256250143051147, - "kl": 5.1015625, - "learning_rate": 1.0825243036868424e-07, - "loss": 0.319, - "num_tokens": 1472174888.0, - "reward": 1.9091796875, - "reward_std": 0.5101285576820374, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.1701863408088684, + "grad_norm": 2.682219982147217, + "kl": 3.25, + "learning_rate": 1.0825867365815255e-07, + "loss": 0.1575, + "num_tokens": 1595020692.0, + "reward": 1.2353515625, + "reward_std": 0.601554274559021, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.232421875, + "rewards/format_reward/std": 0.42278963327407837, + "rewards/tag_count_reward/mean": 0.8564453125, + "rewards/tag_count_reward/std": 0.24460412561893463, "step": 2769 }, { @@ -80316,27 +80316,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 873.265625, - "completions/mean_terminated_length": 823.0224609375, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1100.462890625, + "completions/mean_terminated_length": 1043.5714111328125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, "epoch": 0.9456345480925151, - "grad_norm": 2.38149356842041, - "kl": 8.0859375, - "learning_rate": 1.081505437066386e-07, - "loss": 0.5097, - "num_tokens": 1472707488.0, - "reward": 1.7958984375, - "reward_std": 0.5217913389205933, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.9189453125, - "rewards/tag_count_reward/std": 0.2015654444694519, + "grad_norm": 2.309095621109009, + "kl": 3.31640625, + "learning_rate": 1.081567101501198e-07, + "loss": 0.1757, + "num_tokens": 1595669617.0, + "reward": 1.18505859375, + "reward_std": 0.5600747466087341, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.25390625, + "rewards/format_reward/std": 0.43567025661468506, + "rewards/tag_count_reward/mean": 0.87255859375, + "rewards/tag_count_reward/std": 0.23563192784786224, "step": 2770 }, { @@ -80345,27 +80345,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 856.62109375, - "completions/mean_terminated_length": 785.0890502929688, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1082.302734375, + "completions/mean_terminated_length": 1011.4443969726562, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.9459759324059059, - "grad_norm": 3.7725250720977783, - "kl": 9.3984375, - "learning_rate": 1.0804928416908073e-07, - "loss": 0.53, - "num_tokens": 1473224142.0, - "reward": 1.75146484375, - "reward_std": 0.6223288774490356, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.89208984375, - "rewards/tag_count_reward/std": 0.24017061293125153, + "grad_norm": 4.329871654510498, + "kl": 3.125, + "learning_rate": 1.0805537423372147e-07, + "loss": 0.1466, + "num_tokens": 1596301820.0, + "reward": 1.20849609375, + "reward_std": 0.5813396573066711, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.23828125, + "rewards/format_reward/std": 0.42644867300987244, + "rewards/tag_count_reward/mean": 0.87451171875, + "rewards/tag_count_reward/std": 0.23039521276950836, "step": 2771 }, { @@ -80374,27 +80374,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.064453125, + "completions/clipped_ratio": 0.091796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 877.8359375, - "completions/mean_terminated_length": 797.21923828125, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1178.52734375, + "completions/mean_terminated_length": 1090.6451416015625, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, "epoch": 0.9463173167192968, - "grad_norm": 5.367483139038086, - "kl": 9.8515625, - "learning_rate": 1.0794865189973011e-07, - "loss": 0.557, - "num_tokens": 1473763466.0, - "reward": 1.74267578125, - "reward_std": 0.5900819301605225, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.806640625, - "rewards/format_reward/std": 0.39531853795051575, - "rewards/tag_count_reward/mean": 0.89306640625, - "rewards/tag_count_reward/std": 0.23285804688930511, + "grad_norm": 2.0258078575134277, + "kl": 3.59375, + "learning_rate": 1.0795466605289464e-07, + "loss": 0.1915, + "num_tokens": 1596995098.0, + "reward": 1.15673828125, + "reward_std": 0.5912457704544067, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.255859375, + "rewards/format_reward/std": 0.43676990270614624, + "rewards/tag_count_reward/mean": 0.85400390625, + "rewards/tag_count_reward/std": 0.25180092453956604, "step": 2772 }, { @@ -80403,27 +80403,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 768.138671875, - "completions/mean_terminated_length": 713.3992309570312, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 953.5625, + "completions/mean_terminated_length": 904.4244384765625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, "epoch": 0.9466587010326876, - "grad_norm": 2.9520204067230225, - "kl": 9.2265625, - "learning_rate": 1.0784864704141585e-07, - "loss": 0.5612, - "num_tokens": 1474234529.0, - "reward": 1.9091796875, - "reward_std": 0.5921919941902161, - "rewards/accuracy_reward/mean": 0.119140625, - "rewards/accuracy_reward/std": 0.32427072525024414, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.20888477563858032, + "grad_norm": 3.5342416763305664, + "kl": 2.94140625, + "learning_rate": 1.0785458575068455e-07, + "loss": 0.136, + "num_tokens": 1597561098.0, + "reward": 1.2333984375, + "reward_std": 0.5430473685264587, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, + "rewards/format_reward/mean": 0.193359375, + "rewards/format_reward/std": 0.39531853795051575, + "rewards/tag_count_reward/mean": 0.8876953125, + "rewards/tag_count_reward/std": 0.2163451910018921, "step": 2773 }, { @@ -80432,27 +80432,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.083984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 878.41015625, - "completions/mean_terminated_length": 840.681396484375, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1153.875, + "completions/mean_terminated_length": 1071.897705078125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, "epoch": 0.9470000853460784, - "grad_norm": 3.5780584812164307, - "kl": 8.5078125, - "learning_rate": 1.0774926973607648e-07, - "loss": 0.4941, - "num_tokens": 1474761459.0, - "reward": 1.8134765625, - "reward_std": 0.5407140254974365, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.9228515625, - "rewards/tag_count_reward/std": 0.19821925461292267, + "grad_norm": 2.321030616760254, + "kl": 3.16015625, + "learning_rate": 1.0775513346924465e-07, + "loss": 0.1424, + "num_tokens": 1598229066.0, + "reward": 1.16162109375, + "reward_std": 0.6021189093589783, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.24609375, + "rewards/format_reward/std": 0.4311550557613373, + "rewards/tag_count_reward/mean": 0.83740234375, + "rewards/tag_count_reward/std": 0.2532614767551422, "step": 2774 }, { @@ -80461,27 +80461,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.083984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 843.189453125, - "completions/mean_terminated_length": 783.9364013671875, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1071.234375, + "completions/mean_terminated_length": 981.68017578125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, "epoch": 0.9473414696594692, - "grad_norm": 2.634669542312622, - "kl": 9.1328125, - "learning_rate": 1.0765052012475998e-07, - "loss": 0.556, - "num_tokens": 1475264484.0, - "reward": 1.76513671875, - "reward_std": 0.5684515237808228, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.90576171875, - "rewards/tag_count_reward/std": 0.21791352331638336, + "grad_norm": 3.479919910430908, + "kl": 3.70703125, + "learning_rate": 1.0765630934983644e-07, + "loss": 0.2046, + "num_tokens": 1598848850.0, + "reward": 1.12548828125, + "reward_std": 0.5567803382873535, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.216796875, + "rewards/format_reward/std": 0.4124660789966583, + "rewards/tag_count_reward/mean": 0.85205078125, + "rewards/tag_count_reward/std": 0.2491879016160965, "step": 2775 }, { @@ -80490,27 +80490,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 815.4453125, - "completions/mean_terminated_length": 760.1060791015625, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1008.603515625, + "completions/mean_terminated_length": 920.51904296875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, "epoch": 0.9476828539728599, - "grad_norm": 4.417301654815674, - "kl": 10.0625, - "learning_rate": 1.0755239834762326e-07, - "loss": 0.592, - "num_tokens": 1475757720.0, - "reward": 1.83447265625, - "reward_std": 0.5964944362640381, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.91259765625, - "rewards/tag_count_reward/std": 0.21054047346115112, + "grad_norm": 3.2040023803710938, + "kl": 3.359375, + "learning_rate": 1.0755811353282915e-07, + "loss": 0.186, + "num_tokens": 1599440983.0, + "reward": 1.2060546875, + "reward_std": 0.5739580988883972, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.2109375, + "rewards/format_reward/std": 0.4083731174468994, + "rewards/tag_count_reward/mean": 0.8642578125, + "rewards/tag_count_reward/std": 0.24457286298274994, "step": 2776 }, { @@ -80519,27 +80519,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1997.0, - "completions/mean_length": 833.474609375, - "completions/mean_terminated_length": 789.2206420898438, - "completions/min_length": 94.0, - "completions/min_terminated_length": 94.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1097.33984375, + "completions/mean_terminated_length": 1018.95556640625, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, "epoch": 0.9480242382862507, - "grad_norm": 1.6135154962539673, - "kl": 7.328125, - "learning_rate": 1.0745490454393239e-07, - "loss": 0.44, - "num_tokens": 1476266875.0, - "reward": 1.798828125, - "reward_std": 0.5334725975990295, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.19767135381698608, + "grad_norm": 2.2169642448425293, + "kl": 3.494140625, + "learning_rate": 1.0746054615769942e-07, + "loss": 0.1788, + "num_tokens": 1600085237.0, + "reward": 1.1337890625, + "reward_std": 0.5671852827072144, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.205078125, + "rewards/format_reward/std": 0.4041535556316376, + "rewards/tag_count_reward/mean": 0.8544921875, + "rewards/tag_count_reward/std": 0.24193312227725983, "step": 2777 }, { @@ -80548,27 +80548,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 828.712890625, - "completions/mean_terminated_length": 801.942138671875, - "completions/min_length": 75.0, - "completions/min_terminated_length": 75.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1036.962890625, + "completions/mean_terminated_length": 969.5604858398438, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, "epoch": 0.9483656225996415, - "grad_norm": 1.146044373512268, - "kl": 6.71875, - "learning_rate": 1.0735803885206191e-07, - "loss": 0.4044, - "num_tokens": 1476771064.0, - "reward": 1.87255859375, - "reward_std": 0.5109162330627441, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.17711623013019562, + "grad_norm": 2.867576837539673, + "kl": 2.802734375, + "learning_rate": 1.0736360736303154e-07, + "loss": 0.1205, + "num_tokens": 1600696050.0, + "reward": 1.2236328125, + "reward_std": 0.5877913236618042, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.244140625, + "rewards/format_reward/std": 0.42999663949012756, + "rewards/tag_count_reward/mean": 0.8681640625, + "rewards/tag_count_reward/std": 0.22976237535476685, "step": 2778 }, { @@ -80577,27 +80577,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 828.203125, - "completions/mean_terminated_length": 778.6178588867188, - "completions/min_length": 104.0, - "completions/min_terminated_length": 104.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1051.27734375, + "completions/mean_terminated_length": 1004.396728515625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.9487070069130323, - "grad_norm": 1.207013726234436, - "kl": 7.6796875, - "learning_rate": 1.0726180140949497e-07, - "loss": 0.4986, - "num_tokens": 1477281520.0, - "reward": 1.8486328125, - "reward_std": 0.5367643237113953, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.9287109375, - "rewards/tag_count_reward/std": 0.1979491263628006, + "grad_norm": 5.901501655578613, + "kl": 2.703125, + "learning_rate": 1.0726729728651671e-07, + "loss": 0.1447, + "num_tokens": 1601320720.0, + "reward": 1.3369140625, + "reward_std": 0.6024371981620789, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.3046875, + "rewards/format_reward/std": 0.4607250988483429, + "rewards/tag_count_reward/mean": 0.9013671875, + "rewards/tag_count_reward/std": 0.21166585385799408, "step": 2779 }, { @@ -80606,27 +80606,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.06640625, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 868.1015625, - "completions/mean_terminated_length": 784.1757202148438, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1097.009765625, + "completions/mean_terminated_length": 1022.9325561523438, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, "epoch": 0.9490483912264231, - "grad_norm": 1.251905083656311, - "kl": 7.5703125, - "learning_rate": 1.0716619235282295e-07, - "loss": 0.5291, - "num_tokens": 1477806500.0, - "reward": 1.8759765625, - "reward_std": 0.5983998775482178, - "rewards/accuracy_reward/mean": 0.119140625, - "rewards/accuracy_reward/std": 0.32427072525024414, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.21260276436805725, + "grad_norm": 3.2750980854034424, + "kl": 2.8515625, + "learning_rate": 1.0717161606495317e-07, + "loss": 0.1102, + "num_tokens": 1601962901.0, + "reward": 1.22119140625, + "reward_std": 0.6123496294021606, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.43343618512153625, + "rewards/tag_count_reward/mean": 0.85986328125, + "rewards/tag_count_reward/std": 0.2428334504365921, "step": 2780 }, { @@ -80635,27 +80635,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 833.78125, - "completions/mean_terminated_length": 794.6128540039062, - "completions/min_length": 77.0, - "completions/min_terminated_length": 77.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1058.763671875, + "completions/mean_terminated_length": 988.3995361328125, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, "epoch": 0.949389775539814, - "grad_norm": 1.1650217771530151, - "kl": 5.9921875, - "learning_rate": 1.0707121181774556e-07, - "loss": 0.3707, - "num_tokens": 1478317156.0, - "reward": 1.81884765625, - "reward_std": 0.5314297676086426, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.2117307335138321, + "grad_norm": 1.626927137374878, + "kl": 3.265625, + "learning_rate": 1.0707656383424609e-07, + "loss": 0.1438, + "num_tokens": 1602588748.0, + "reward": 1.20703125, + "reward_std": 0.5813268423080444, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.240234375, + "rewards/format_reward/std": 0.4276435375213623, + "rewards/tag_count_reward/mean": 0.88671875, + "rewards/tag_count_reward/std": 0.23009692132472992, "step": 2781 }, { @@ -80664,27 +80664,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 822.16015625, - "completions/mean_terminated_length": 772.3292236328125, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1012.14453125, + "completions/mean_terminated_length": 976.5697631835938, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, "epoch": 0.9497311598532048, - "grad_norm": 2.7264535427093506, - "kl": 6.0546875, - "learning_rate": 1.0697685993907009e-07, - "loss": 0.4079, - "num_tokens": 1478811750.0, - "reward": 1.833984375, - "reward_std": 0.5194734930992126, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.19686728715896606, + "grad_norm": 2.6105847358703613, + "kl": 3.15234375, + "learning_rate": 1.0698214072940701e-07, + "loss": 0.15, + "num_tokens": 1603180614.0, + "reward": 1.21240234375, + "reward_std": 0.5675527453422546, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.23828125, + "rewards/format_reward/std": 0.42644867300987244, + "rewards/tag_count_reward/mean": 0.89404296875, + "rewards/tag_count_reward/std": 0.2209184318780899, "step": 2782 }, { @@ -80693,27 +80693,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 776.794921875, - "completions/mean_terminated_length": 746.2860107421875, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 972.1484375, + "completions/mean_terminated_length": 928.4146118164062, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.9500725441665956, - "grad_norm": 1.6510119438171387, - "kl": 4.10546875, - "learning_rate": 1.0688313685071194e-07, - "loss": 0.2936, - "num_tokens": 1479280301.0, - "reward": 1.97412109375, - "reward_std": 0.44091886281967163, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.95458984375, - "rewards/tag_count_reward/std": 0.15722467005252838, + "grad_norm": 2.4629106521606445, + "kl": 3.3984375, + "learning_rate": 1.0688834688455399e-07, + "loss": 0.1527, + "num_tokens": 1603749186.0, + "reward": 1.23876953125, + "reward_std": 0.5696773529052734, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.23828125, + "rewards/format_reward/std": 0.42644867300987244, + "rewards/tag_count_reward/mean": 0.89892578125, + "rewards/tag_count_reward/std": 0.21253253519535065, "step": 2783 }, { @@ -80722,27 +80722,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 830.234375, - "completions/mean_terminated_length": 778.1507568359375, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1052.49609375, + "completions/mean_terminated_length": 1014.1298217773438, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, "epoch": 0.9504139284799863, - "grad_norm": 2.9825966358184814, - "kl": 5.1796875, - "learning_rate": 1.067900426856939e-07, - "loss": 0.3634, - "num_tokens": 1479774613.0, - "reward": 1.88232421875, - "reward_std": 0.4768640995025635, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.94287109375, - "rewards/tag_count_reward/std": 0.1749558448791504, + "grad_norm": 1.5237226486206055, + "kl": 3.01171875, + "learning_rate": 1.0679518243291127e-07, + "loss": 0.1283, + "num_tokens": 1604357296.0, + "reward": 1.201171875, + "reward_std": 0.5628089904785156, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.263671875, + "rewards/format_reward/std": 0.4410543739795685, + "rewards/tag_count_reward/mean": 0.876953125, + "rewards/tag_count_reward/std": 0.2282540202140808, "step": 2784 }, { @@ -80751,27 +80751,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1948.0, - "completions/mean_length": 835.341796875, - "completions/mean_terminated_length": 786.0466918945312, - "completions/min_length": 39.0, - "completions/min_terminated_length": 39.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1008.515625, + "completions/mean_terminated_length": 964.0570678710938, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, "epoch": 0.9507553127933771, - "grad_norm": 2.1468441486358643, - "kl": 4.765625, - "learning_rate": 1.0669757757614602e-07, - "loss": 0.3044, - "num_tokens": 1480288452.0, - "reward": 1.8671875, - "reward_std": 0.47573575377464294, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.1783159226179123, + "grad_norm": 2.4121291637420654, + "kl": 3.20703125, + "learning_rate": 1.0670264750680906e-07, + "loss": 0.1476, + "num_tokens": 1604959800.0, + "reward": 1.19091796875, + "reward_std": 0.546492338180542, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.43343618512153625, + "rewards/tag_count_reward/mean": 0.88818359375, + "rewards/tag_count_reward/std": 0.21232111752033234, "step": 2785 }, { @@ -80780,27 +80780,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 796.875, - "completions/mean_terminated_length": 740.7020263671875, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1025.025390625, + "completions/mean_terminated_length": 956.8271484375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.9510966971067679, - "grad_norm": 1.9844145774841309, - "kl": 5.6640625, - "learning_rate": 1.0660574165330567e-07, - "loss": 0.399, - "num_tokens": 1480773092.0, - "reward": 1.94384765625, - "reward_std": 0.48710376024246216, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.908203125, - "rewards/format_reward/std": 0.289021372795105, - "rewards/tag_count_reward/mean": 0.94580078125, - "rewards/tag_count_reward/std": 0.18136507272720337, + "grad_norm": 2.8969452381134033, + "kl": 4.3046875, + "learning_rate": 1.0661074223768346e-07, + "loss": 0.2122, + "num_tokens": 1605561253.0, + "reward": 1.234375, + "reward_std": 0.5958242416381836, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.232421875, + "rewards/format_reward/std": 0.42278963327407837, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.23793669044971466, "step": 2786 }, { @@ -80809,27 +80809,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.087890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2018.0, - "completions/mean_length": 821.798828125, - "completions/mean_terminated_length": 779.6868896484375, - "completions/min_length": 77.0, - "completions/min_terminated_length": 77.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1088.86328125, + "completions/mean_terminated_length": 996.4411010742188, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.9514380814201587, - "grad_norm": 1.1855788230895996, - "kl": 5.01953125, - "learning_rate": 1.065145350475171e-07, - "loss": 0.3068, - "num_tokens": 1481271917.0, - "reward": 1.93017578125, - "reward_std": 0.506824254989624, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.17463330924510956, + "grad_norm": 3.4848549365997314, + "kl": 4.12109375, + "learning_rate": 1.0651946675607618e-07, + "loss": 0.1972, + "num_tokens": 1606196815.0, + "reward": 1.193359375, + "reward_std": 0.6197876930236816, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.220703125, + "rewards/format_reward/std": 0.4151262938976288, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.2475108504295349, "step": 2787 }, { @@ -80838,27 +80838,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 851.8984375, - "completions/mean_terminated_length": 815.7987670898438, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1105.27734375, + "completions/mean_terminated_length": 1042.42919921875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.9517794657335495, - "grad_norm": 1.280172348022461, - "kl": 4.97265625, - "learning_rate": 1.0642395788823144e-07, - "loss": 0.3283, - "num_tokens": 1481787961.0, - "reward": 1.896484375, - "reward_std": 0.4289102554321289, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.900390625, - "rewards/format_reward/std": 0.29977133870124817, - "rewards/tag_count_reward/mean": 0.9453125, - "rewards/tag_count_reward/std": 0.1781550794839859, + "grad_norm": 2.801020383834839, + "kl": 3.41015625, + "learning_rate": 1.064288211916344e-07, + "loss": 0.1365, + "num_tokens": 1606842589.0, + "reward": 1.2197265625, + "reward_std": 0.6192061305046082, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.28515625, + "rewards/format_reward/std": 0.45193037390708923, + "rewards/tag_count_reward/mean": 0.8642578125, + "rewards/tag_count_reward/std": 0.23900854587554932, "step": 2788 }, { @@ -80867,27 +80867,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1926.0, - "completions/mean_length": 798.341796875, - "completions/mean_terminated_length": 747.5426635742188, - "completions/min_length": 47.0, - "completions/min_terminated_length": 47.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1058.421875, + "completions/mean_terminated_length": 999.0062255859375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, "epoch": 0.9521208500469404, - "grad_norm": 1.7842223644256592, - "kl": 6.50390625, - "learning_rate": 1.0633401030400637e-07, - "loss": 0.4106, - "num_tokens": 1482275576.0, - "reward": 1.89013671875, - "reward_std": 0.4526803493499756, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.17277398705482483, + "grad_norm": 4.1184258460998535, + "kl": 3.03515625, + "learning_rate": 1.0633880567311049e-07, + "loss": 0.1369, + "num_tokens": 1607463365.0, + "reward": 1.30419921875, + "reward_std": 0.6136320233345032, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.345703125, + "rewards/format_reward/std": 0.4760620892047882, + "rewards/tag_count_reward/mean": 0.88623046875, + "rewards/tag_count_reward/std": 0.2241986095905304, "step": 2789 }, { @@ -80896,27 +80896,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 828.2421875, - "completions/mean_terminated_length": 778.6585083007812, - "completions/min_length": 44.0, - "completions/min_terminated_length": 44.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1144.951171875, + "completions/mean_terminated_length": 1051.5323486328125, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, "epoch": 0.9524622343603312, - "grad_norm": 1.977900743484497, - "kl": 5.6953125, - "learning_rate": 1.0624469242250607e-07, - "loss": 0.3367, - "num_tokens": 1482786180.0, - "reward": 1.90087890625, - "reward_std": 0.49716493487358093, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.18154938519001007, + "grad_norm": 2.695028305053711, + "kl": 3.61328125, + "learning_rate": 1.0624942032836199e-07, + "loss": 0.168, + "num_tokens": 1608136124.0, + "reward": 1.22216796875, + "reward_std": 0.6235449314117432, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.26953125, + "rewards/format_reward/std": 0.44415023922920227, + "rewards/tag_count_reward/mean": 0.84912109375, + "rewards/tag_count_reward/std": 0.2484082579612732, "step": 2790 }, { @@ -80925,27 +80925,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1967.0, - "completions/mean_length": 773.541015625, - "completions/mean_terminated_length": 742.9540405273438, - "completions/min_length": 36.0, - "completions/min_terminated_length": 36.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1049.470703125, + "completions/mean_terminated_length": 969.4197998046875, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, "epoch": 0.952803618673722, - "grad_norm": 1.6823440790176392, - "kl": 5.61328125, - "learning_rate": 1.0615600437050094e-07, - "loss": 0.329, - "num_tokens": 1483263497.0, - "reward": 1.96923828125, - "reward_std": 0.4446501135826111, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.908203125, - "rewards/format_reward/std": 0.289021372795105, - "rewards/tag_count_reward/mean": 0.95166015625, - "rewards/tag_count_reward/std": 0.15944552421569824, + "grad_norm": 5.9639458656311035, + "kl": 3.34375, + "learning_rate": 1.0616066528435134e-07, + "loss": 0.153, + "num_tokens": 1608754717.0, + "reward": 1.3115234375, + "reward_std": 0.6267759203910828, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.4639657139778137, + "rewards/tag_count_reward/mean": 0.8837890625, + "rewards/tag_count_reward/std": 0.22593770921230316, "step": 2791 }, { @@ -80954,27 +80954,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 799.955078125, - "completions/mean_terminated_length": 772.5529174804688, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1054.849609375, + "completions/mean_terminated_length": 986.427978515625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.9531450029871127, - "grad_norm": 1.415523886680603, - "kl": 6.4765625, - "learning_rate": 1.0606794627386739e-07, - "loss": 0.4152, - "num_tokens": 1483751570.0, - "reward": 1.859375, - "reward_std": 0.4958192706108093, - "rewards/accuracy_reward/mean": 0.05645161122083664, - "rewards/accuracy_reward/std": 0.23102475702762604, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.9375, - "rewards/tag_count_reward/std": 0.1843961924314499, + "grad_norm": 2.85768461227417, + "kl": 3.4921875, + "learning_rate": 1.0607254066714566e-07, + "loss": 0.1562, + "num_tokens": 1609373296.0, + "reward": 1.23828125, + "reward_std": 0.622559666633606, + "rewards/accuracy_reward/mean": 0.08669354766607285, + "rewards/accuracy_reward/std": 0.281669557094574, + "rewards/format_reward/mean": 0.283203125, + "rewards/format_reward/std": 0.4509948492050171, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.23716437816619873, "step": 2792 }, { @@ -80983,27 +80983,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 841.2734375, - "completions/mean_terminated_length": 781.9262084960938, - "completions/min_length": 23.0, - "completions/min_terminated_length": 23.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1033.12109375, + "completions/mean_terminated_length": 972.1863403320312, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.9534863873005035, - "grad_norm": 3.0615651607513428, - "kl": 8.5546875, - "learning_rate": 1.0598051825758785e-07, - "loss": 0.5124, - "num_tokens": 1484260062.0, - "reward": 1.859375, - "reward_std": 0.5691763162612915, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.8515625, - "rewards/format_reward/std": 0.35588082671165466, - "rewards/tag_count_reward/mean": 0.921875, - "rewards/tag_count_reward/std": 0.2021169811487198, + "grad_norm": 2.3304545879364014, + "kl": 2.82421875, + "learning_rate": 1.0598504660191671e-07, + "loss": 0.1105, + "num_tokens": 1609980014.0, + "reward": 1.27587890625, + "reward_std": 0.6237931251525879, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.263671875, + "rewards/format_reward/std": 0.4410543739795685, + "rewards/tag_count_reward/mean": 0.88134765625, + "rewards/tag_count_reward/std": 0.21775121986865997, "step": 2793 }, { @@ -81012,27 +81012,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1917.0, - "completions/mean_length": 836.037109375, - "completions/mean_terminated_length": 789.3285522460938, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1108.966796875, + "completions/mean_terminated_length": 1027.22509765625, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, "epoch": 0.9538277716138943, - "grad_norm": 1.6534143686294556, - "kl": 6.7578125, - "learning_rate": 1.0589372044575035e-07, - "loss": 0.4172, - "num_tokens": 1484767281.0, - "reward": 1.8564453125, - "reward_std": 0.47521013021469116, - "rewards/accuracy_reward/mean": 0.038306452333927155, - "rewards/accuracy_reward/std": 0.19212883710861206, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.18472495675086975, + "grad_norm": 1.865417718887329, + "kl": 3.73828125, + "learning_rate": 1.0589818321294048e-07, + "loss": 0.1846, + "num_tokens": 1610626973.0, + "reward": 1.21337890625, + "reward_std": 0.5856899619102478, + "rewards/accuracy_reward/mean": 0.05040322616696358, + "rewards/accuracy_reward/std": 0.21899642050266266, + "rewards/format_reward/mean": 0.296875, + "rewards/format_reward/std": 0.45732781291007996, + "rewards/tag_count_reward/mean": 0.86767578125, + "rewards/tag_count_reward/std": 0.2318669855594635, "step": 2794 }, { @@ -81041,27 +81041,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 831.046875, - "completions/mean_terminated_length": 789.2525634765625, - "completions/min_length": 82.0, - "completions/min_terminated_length": 82.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1058.859375, + "completions/mean_terminated_length": 997.294677734375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, "epoch": 0.9541691559272851, - "grad_norm": 2.2055423259735107, - "kl": 7.1328125, - "learning_rate": 1.0580755296154857e-07, - "loss": 0.4278, - "num_tokens": 1485270873.0, - "reward": 1.8720703125, - "reward_std": 0.4822779595851898, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.873046875, - "rewards/format_reward/std": 0.33324605226516724, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.18272781372070312, + "grad_norm": 4.216657638549805, + "kl": 4.24609375, + "learning_rate": 1.058119506235973e-07, + "loss": 0.1657, + "num_tokens": 1611247205.0, + "reward": 1.2275390625, + "reward_std": 0.642439603805542, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.255859375, + "rewards/format_reward/std": 0.43676990270614624, + "rewards/tag_count_reward/mean": 0.8603515625, + "rewards/tag_count_reward/std": 0.24436962604522705, "step": 2795 }, { @@ -81070,27 +81070,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 825.9765625, - "completions/mean_terminated_length": 791.6224365234375, - "completions/min_length": 95.0, - "completions/min_terminated_length": 95.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1035.455078125, + "completions/mean_terminated_length": 979.0866088867188, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, "epoch": 0.9545105402406759, - "grad_norm": 2.20977783203125, - "kl": 7.453125, - "learning_rate": 1.0572201592728136e-07, - "loss": 0.4529, - "num_tokens": 1485766477.0, - "reward": 1.8544921875, - "reward_std": 0.5384957194328308, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.1813209503889084, + "grad_norm": 6.128505706787109, + "kl": 4.03515625, + "learning_rate": 1.0572634895637133e-07, + "loss": 0.1991, + "num_tokens": 1611850062.0, + "reward": 1.18798828125, + "reward_std": 0.5878502130508423, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.23828125, + "rewards/format_reward/std": 0.42644867300987244, + "rewards/tag_count_reward/mean": 0.85791015625, + "rewards/tag_count_reward/std": 0.24370937049388885, "step": 2796 }, { @@ -81099,27 +81099,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.087890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 833.939453125, - "completions/mean_terminated_length": 794.7761840820312, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1099.015625, + "completions/mean_terminated_length": 1007.5717163085938, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, "epoch": 0.9548519245540668, - "grad_norm": 1.5561234951019287, - "kl": 5.984375, - "learning_rate": 1.0563710946435309e-07, - "loss": 0.3802, - "num_tokens": 1486279038.0, - "reward": 1.88671875, - "reward_std": 0.48291343450546265, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.94140625, - "rewards/tag_count_reward/std": 0.17270830273628235, + "grad_norm": 4.615504264831543, + "kl": 3.4765625, + "learning_rate": 1.0564137833285074e-07, + "loss": 0.1322, + "num_tokens": 1612498342.0, + "reward": 1.25146484375, + "reward_std": 0.6047146916389465, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.275390625, + "rewards/format_reward/std": 0.44714778661727905, + "rewards/tag_count_reward/mean": 0.87841796875, + "rewards/tag_count_reward/std": 0.23037032783031464, "step": 2797 }, { @@ -81128,27 +81128,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1973.0, - "completions/mean_length": 846.279296875, - "completions/mean_terminated_length": 805.0081176757812, - "completions/min_length": 120.0, - "completions/min_terminated_length": 120.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1088.919921875, + "completions/mean_terminated_length": 1035.52783203125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, "epoch": 0.9551933088674576, - "grad_norm": 1.6758620738983154, - "kl": 5.45703125, - "learning_rate": 1.0555283369327283e-07, - "loss": 0.3251, - "num_tokens": 1486784653.0, - "reward": 1.84228515625, - "reward_std": 0.43197542428970337, - "rewards/accuracy_reward/mean": 0.013671875, - "rewards/accuracy_reward/std": 0.1162383034825325, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.93994140625, - "rewards/tag_count_reward/std": 0.18018634617328644, + "grad_norm": 4.33992338180542, + "kl": 3.74609375, + "learning_rate": 1.0555703887372733e-07, + "loss": 0.1356, + "num_tokens": 1613128189.0, + "reward": 1.22509765625, + "reward_std": 0.6068856120109558, + "rewards/accuracy_reward/mean": 0.03515625, + "rewards/accuracy_reward/std": 0.1843547374010086, + "rewards/format_reward/mean": 0.330078125, + "rewards/format_reward/std": 0.47070086002349854, + "rewards/tag_count_reward/mean": 0.85986328125, + "rewards/tag_count_reward/std": 0.23515698313713074, "step": 2798 }, { @@ -81157,27 +81157,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 777.490234375, - "completions/mean_terminated_length": 746.998046875, - "completions/min_length": 92.0, - "completions/min_terminated_length": 92.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 971.728515625, + "completions/mean_terminated_length": 934.7656860351562, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.9555346931808484, - "grad_norm": 2.062755584716797, - "kl": 3.61328125, - "learning_rate": 1.0546918873365457e-07, - "loss": 0.2445, - "num_tokens": 1487258040.0, - "reward": 1.96044921875, - "reward_std": 0.4145369529724121, - "rewards/accuracy_reward/mean": 0.09072580933570862, - "rewards/accuracy_reward/std": 0.2875087857246399, - "rewards/format_reward/mean": 0.9140625, - "rewards/format_reward/std": 0.28054583072662354, - "rewards/tag_count_reward/mean": 0.95849609375, - "rewards/tag_count_reward/std": 0.1445726603269577, + "grad_norm": 3.814810276031494, + "kl": 3.359375, + "learning_rate": 1.0547333069879628e-07, + "loss": 0.1431, + "num_tokens": 1613701026.0, + "reward": 1.3251953125, + "reward_std": 0.6407530903816223, + "rewards/accuracy_reward/mean": 0.13508065044879913, + "rewards/accuracy_reward/std": 0.3421548008918762, + "rewards/format_reward/mean": 0.30859375, + "rewards/format_reward/std": 0.4623647928237915, + "rewards/tag_count_reward/mean": 0.8857421875, + "rewards/tag_count_reward/std": 0.21813978254795074, "step": 2799 }, { @@ -81186,27 +81186,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1979.0, - "completions/mean_length": 844.419921875, - "completions/mean_terminated_length": 810.5842895507812, - "completions/min_length": 125.0, - "completions/min_terminated_length": 125.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1090.88671875, + "completions/mean_terminated_length": 1016.3325805664062, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, "epoch": 0.9558760774942391, - "grad_norm": 1.3885384798049927, - "kl": 4.453125, - "learning_rate": 1.0538617470421715e-07, - "loss": 0.3121, - "num_tokens": 1487764111.0, - "reward": 1.9013671875, - "reward_std": 0.4131018817424774, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.90625, - "rewards/format_reward/std": 0.29176566004753113, - "rewards/tag_count_reward/mean": 0.9521484375, - "rewards/tag_count_reward/std": 0.16149762272834778, + "grad_norm": 2.81024432182312, + "kl": 3.55078125, + "learning_rate": 1.053902539269563e-07, + "loss": 0.1487, + "num_tokens": 1614333288.0, + "reward": 1.23779296875, + "reward_std": 0.5979641675949097, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.298828125, + "rewards/format_reward/std": 0.45819199085235596, + "rewards/tag_count_reward/mean": 0.86474609375, + "rewards/tag_count_reward/std": 0.2301669865846634, "step": 2800 }, { @@ -81215,27 +81215,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1953.0, - "completions/mean_length": 865.474609375, - "completions/mean_terminated_length": 827.32861328125, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1172.650390625, + "completions/mean_terminated_length": 1082.096923828125, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, "epoch": 0.9562174618076299, - "grad_norm": 1.4190959930419922, - "kl": 4.97265625, - "learning_rate": 1.0530379172278375e-07, - "loss": 0.3583, - "num_tokens": 1488289714.0, - "reward": 1.86181640625, - "reward_std": 0.46865037083625793, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.94384765625, - "rewards/tag_count_reward/std": 0.17031753063201904, + "grad_norm": 2.8505945205688477, + "kl": 3.390625, + "learning_rate": 1.0530780867620914e-07, + "loss": 0.1327, + "num_tokens": 1615016165.0, + "reward": 1.2001953125, + "reward_std": 0.624879002571106, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.298828125, + "rewards/format_reward/std": 0.45819199085235596, + "rewards/tag_count_reward/mean": 0.8408203125, + "rewards/tag_count_reward/std": 0.2532638609409332, "step": 2801 }, { @@ -81244,27 +81244,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 831.384765625, - "completions/mean_terminated_length": 802.18603515625, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1063.912109375, + "completions/mean_terminated_length": 982.7716674804688, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, "epoch": 0.9565588461210207, - "grad_norm": 1.2509667873382568, - "kl": 4.17578125, - "learning_rate": 1.0522203990628196e-07, - "loss": 0.2749, - "num_tokens": 1488792359.0, - "reward": 1.9306640625, - "reward_std": 0.47656846046447754, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.892578125, - "rewards/format_reward/std": 0.30995169281959534, - "rewards/tag_count_reward/mean": 0.9443359375, - "rewards/tag_count_reward/std": 0.162770614027977, + "grad_norm": 3.750340700149536, + "kl": 3.97265625, + "learning_rate": 1.0522599506365953e-07, + "loss": 0.1588, + "num_tokens": 1615637864.0, + "reward": 1.2666015625, + "reward_std": 0.6450583934783936, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.2890625, + "rewards/format_reward/std": 0.45377036929130554, + "rewards/tag_count_reward/mean": 0.8544921875, + "rewards/tag_count_reward/std": 0.24193312227725983, "step": 2802 }, { @@ -81273,27 +81273,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.087890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1968.0, - "completions/mean_length": 777.349609375, - "completions/mean_terminated_length": 749.4511108398438, - "completions/min_length": 106.0, - "completions/min_terminated_length": 106.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1051.740234375, + "completions/mean_terminated_length": 955.7409057617188, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, "epoch": 0.9569002304344115, - "grad_norm": 1.0817357301712036, - "kl": 4.1328125, - "learning_rate": 1.0514091937074349e-07, - "loss": 0.2997, - "num_tokens": 1489267802.0, - "reward": 1.93310546875, - "reward_std": 0.48178669810295105, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.900390625, - "rewards/format_reward/std": 0.29977133870124817, - "rewards/tag_count_reward/mean": 0.93701171875, - "rewards/tag_count_reward/std": 0.18456129729747772, + "grad_norm": 3.2822134494781494, + "kl": 3.2734375, + "learning_rate": 1.0514481320551505e-07, + "loss": 0.1663, + "num_tokens": 1616253795.0, + "reward": 1.22607421875, + "reward_std": 0.5891274213790894, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.23828125, + "rewards/format_reward/std": 0.42644867300987244, + "rewards/tag_count_reward/mean": 0.87255859375, + "rewards/tag_count_reward/std": 0.22337453067302704, "step": 2803 }, { @@ -81302,27 +81302,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 820.345703125, - "completions/mean_terminated_length": 798.3796997070312, - "completions/min_length": 81.0, - "completions/min_terminated_length": 81.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1047.03125, + "completions/mean_terminated_length": 997.80322265625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, "epoch": 0.9572416147478023, - "grad_norm": 1.4881634712219238, - "kl": 4.26953125, - "learning_rate": 1.050604302313042e-07, - "loss": 0.2953, - "num_tokens": 1489766955.0, - "reward": 1.94677734375, - "reward_std": 0.4905741214752197, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.896484375, - "rewards/format_reward/std": 0.30492907762527466, - "rewards/tag_count_reward/mean": 0.95068359375, - "rewards/tag_count_reward/std": 0.15837518870830536, + "grad_norm": 1.7874726057052612, + "kl": 3.38671875, + "learning_rate": 1.0506426321708588e-07, + "loss": 0.1271, + "num_tokens": 1616869011.0, + "reward": 1.30029296875, + "reward_std": 0.6030961275100708, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.283203125, + "rewards/format_reward/std": 0.4509948492050171, + "rewards/tag_count_reward/mean": 0.88427734375, + "rewards/tag_count_reward/std": 0.21595340967178345, "step": 2804 }, { @@ -81331,27 +81331,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1871.0, - "completions/mean_length": 743.041015625, - "completions/mean_terminated_length": 727.5671997070312, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1000.15625, + "completions/mean_terminated_length": 957.5609130859375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, "epoch": 0.9575829990611932, - "grad_norm": 1.4065176248550415, - "kl": 3.58203125, - "learning_rate": 1.0498057260220361e-07, - "loss": 0.2319, - "num_tokens": 1490223936.0, - "reward": 1.9267578125, - "reward_std": 0.4127148687839508, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.91015625, - "rewards/format_reward/std": 0.2862374484539032, - "rewards/tag_count_reward/mean": 0.9560546875, - "rewards/tag_count_reward/std": 0.14511774480342865, + "grad_norm": 2.0482962131500244, + "kl": 3.28515625, + "learning_rate": 1.0498434521278483e-07, + "loss": 0.1503, + "num_tokens": 1617457635.0, + "reward": 1.19140625, + "reward_std": 0.5965229868888855, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.25390625, + "rewards/format_reward/std": 0.43567025661468506, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.24402722716331482, "step": 2805 }, { @@ -81360,27 +81360,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 876.193359375, - "completions/mean_terminated_length": 845.6653442382812, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1155.318359375, + "completions/mean_terminated_length": 1087.8046875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, "epoch": 0.957924383374584, - "grad_norm": 1.7111616134643555, - "kl": 6.0234375, - "learning_rate": 1.0490134659678501e-07, - "loss": 0.4275, - "num_tokens": 1490752083.0, - "reward": 1.85595703125, - "reward_std": 0.5251683592796326, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.1918918341398239, + "grad_norm": 3.1690008640289307, + "kl": 3.2265625, + "learning_rate": 1.0490505930612697e-07, + "loss": 0.1477, + "num_tokens": 1618128694.0, + "reward": 1.21728515625, + "reward_std": 0.6101462841033936, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.291015625, + "rewards/format_reward/std": 0.45467492938041687, + "rewards/tag_count_reward/mean": 0.85400390625, + "rewards/tag_count_reward/std": 0.24689576029777527, "step": 2806 }, { @@ -81389,27 +81389,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1978.0, - "completions/mean_length": 800.888671875, - "completions/mean_terminated_length": 776.0458374023438, - "completions/min_length": 120.0, - "completions/min_terminated_length": 120.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1047.83203125, + "completions/mean_terminated_length": 1000.7893676757812, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, "epoch": 0.9582657676879748, - "grad_norm": 1.5385019779205322, - "kl": 4.2265625, - "learning_rate": 1.0482275232749527e-07, - "loss": 0.2499, - "num_tokens": 1491240234.0, - "reward": 1.95068359375, - "reward_std": 0.39756447076797485, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.9140625, - "rewards/format_reward/std": 0.28054583072662354, - "rewards/tag_count_reward/mean": 0.95849609375, - "rewards/tag_count_reward/std": 0.1445726603269577, + "grad_norm": 2.6651482582092285, + "kl": 3.17578125, + "learning_rate": 1.0482640560972955e-07, + "loss": 0.1164, + "num_tokens": 1618743280.0, + "reward": 1.2900390625, + "reward_std": 0.6120122671127319, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.306640625, + "rewards/format_reward/std": 0.4615498185157776, + "rewards/tag_count_reward/mean": 0.8818359375, + "rewards/tag_count_reward/std": 0.22546352446079254, "step": 2807 }, { @@ -81418,27 +81418,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 813.478515625, - "completions/mean_terminated_length": 783.8500366210938, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1096.685546875, + "completions/mean_terminated_length": 1013.8748168945312, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.9586071520013656, - "grad_norm": 1.1500355005264282, - "kl": 5.3203125, - "learning_rate": 1.0474478990588456e-07, - "loss": 0.3168, - "num_tokens": 1491737615.0, - "reward": 1.85009765625, - "reward_std": 0.4937742054462433, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.1898845136165619, + "grad_norm": 2.6078107357025146, + "kl": 3.40625, + "learning_rate": 1.0474838423531176e-07, + "loss": 0.158, + "num_tokens": 1619385663.0, + "reward": 1.244140625, + "reward_std": 0.6563979387283325, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.310546875, + "rewards/format_reward/std": 0.46317005157470703, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.26168274879455566, "step": 2808 }, { @@ -81447,27 +81447,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 741.298828125, - "completions/mean_terminated_length": 728.4122314453125, - "completions/min_length": 117.0, - "completions/min_terminated_length": 117.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1014.033203125, + "completions/mean_terminated_length": 949.678466796875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, "epoch": 0.9589485363147563, - "grad_norm": 1.122118592262268, - "kl": 3.59765625, - "learning_rate": 1.0466745944260631e-07, - "loss": 0.2334, - "num_tokens": 1492194920.0, - "reward": 1.87744140625, - "reward_std": 0.4407830834388733, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.94970703125, - "rewards/tag_count_reward/std": 0.15175074338912964, + "grad_norm": 5.4195122718811035, + "kl": 2.94921875, + "learning_rate": 1.0467099529369473e-07, + "loss": 0.1527, + "num_tokens": 1619982608.0, + "reward": 1.216796875, + "reward_std": 0.5864124298095703, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.287109375, + "rewards/format_reward/std": 0.45285552740097046, + "rewards/tag_count_reward/mean": 0.869140625, + "rewards/tag_count_reward/std": 0.22872239351272583, "step": 2809 }, { @@ -81476,27 +81476,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 879.783203125, - "completions/mean_terminated_length": 846.9417114257812, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1155.8359375, + "completions/mean_terminated_length": 1100.30712890625, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, "epoch": 0.9592899206281471, - "grad_norm": 1.481045126914978, - "kl": 5.234375, - "learning_rate": 1.0459076104741699e-07, - "loss": 0.3218, - "num_tokens": 1492723961.0, - "reward": 1.86279296875, - "reward_std": 0.5158882141113281, + "grad_norm": 1.3648179769515991, + "kl": 2.8046875, + "learning_rate": 1.0459423889480126e-07, + "loss": 0.0923, + "num_tokens": 1620652988.0, + "reward": 1.21630859375, + "reward_std": 0.6065263152122498, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.93505859375, - "rewards/tag_count_reward/std": 0.1777946949005127, + "rewards/format_reward/mean": 0.296875, + "rewards/format_reward/std": 0.45732781291007996, + "rewards/tag_count_reward/mean": 0.86669921875, + "rewards/tag_count_reward/std": 0.2286466658115387, "step": 2810 }, { @@ -81505,27 +81505,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 777.560546875, - "completions/mean_terminated_length": 752.2529907226562, - "completions/min_length": 117.0, - "completions/min_terminated_length": 117.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1092.21875, + "completions/mean_terminated_length": 1009.0191650390625, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, "epoch": 0.9596313049415379, - "grad_norm": 1.2403146028518677, - "kl": 4.7578125, - "learning_rate": 1.045146948291758e-07, - "loss": 0.32, - "num_tokens": 1493210216.0, - "reward": 1.95751953125, - "reward_std": 0.5043487548828125, - "rewards/accuracy_reward/mean": 0.125, - "rewards/accuracy_reward/std": 0.3310423493385315, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.94384765625, - "rewards/tag_count_reward/std": 0.15763740241527557, + "grad_norm": 2.480170726776123, + "kl": 3.28125, + "learning_rate": 1.0451811514765569e-07, + "loss": 0.1393, + "num_tokens": 1621300348.0, + "reward": 1.35693359375, + "reward_std": 0.6668035984039307, + "rewards/accuracy_reward/mean": 0.169921875, + "rewards/accuracy_reward/std": 0.3759314715862274, + "rewards/format_reward/mean": 0.322265625, + "rewards/format_reward/std": 0.46780112385749817, + "rewards/tag_count_reward/mean": 0.86474609375, + "rewards/tag_count_reward/std": 0.2374899983406067, "step": 2811 }, { @@ -81534,27 +81534,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.095703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 837.96875, - "completions/mean_terminated_length": 781.05517578125, - "completions/min_length": 98.0, - "completions/min_terminated_length": 98.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1169.443359375, + "completions/mean_terminated_length": 1076.46435546875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.9599726892549287, - "grad_norm": 1.8360158205032349, - "kl": 7.0390625, - "learning_rate": 1.0443926089584498e-07, - "loss": 0.4611, - "num_tokens": 1493719512.0, - "reward": 1.84912109375, - "reward_std": 0.532717227935791, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.1996619999408722, + "grad_norm": 3.8796234130859375, + "kl": 2.658203125, + "learning_rate": 1.0444262416038376e-07, + "loss": 0.1199, + "num_tokens": 1621979359.0, + "reward": 1.2578125, + "reward_std": 0.6507127285003662, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.3515625, + "rewards/format_reward/std": 0.4779251217842102, + "rewards/tag_count_reward/mean": 0.853515625, + "rewards/tag_count_reward/std": 0.24980881810188293, "step": 2812 }, { @@ -81563,27 +81563,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 775.24609375, - "completions/mean_terminated_length": 752.47314453125, - "completions/min_length": 51.0, - "completions/min_terminated_length": 51.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1084.6171875, + "completions/mean_terminated_length": 998.5276489257812, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.9603140735683195, - "grad_norm": 1.177033543586731, - "kl": 5.24609375, - "learning_rate": 1.0436445935448916e-07, - "loss": 0.3232, - "num_tokens": 1494202086.0, - "reward": 1.89501953125, - "reward_std": 0.5035998821258545, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.17602849006652832, + "grad_norm": 3.1738040447235107, + "kl": 3.515625, + "learning_rate": 1.0436776604021244e-07, + "loss": 0.1825, + "num_tokens": 1622620331.0, + "reward": 1.2802734375, + "reward_std": 0.6633602976799011, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.33203125, + "rewards/format_reward/std": 0.47140273451805115, + "rewards/tag_count_reward/mean": 0.8564453125, + "rewards/tag_count_reward/std": 0.24659612774848938, "step": 2813 }, { @@ -81592,27 +81592,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 768.931640625, - "completions/mean_terminated_length": 748.6290283203125, - "completions/min_length": 124.0, - "completions/min_terminated_length": 124.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1077.306640625, + "completions/mean_terminated_length": 1016.8900756835938, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.9606554578817104, - "grad_norm": 1.8969199657440186, - "kl": 4.5859375, - "learning_rate": 1.0429029031127539e-07, - "loss": 0.3024, - "num_tokens": 1494671603.0, - "reward": 1.943359375, - "reward_std": 0.48436421155929565, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.908203125, - "rewards/format_reward/std": 0.289021372795105, - "rewards/tag_count_reward/mean": 0.951171875, - "rewards/tag_count_reward/std": 0.16421131789684296, + "grad_norm": 3.4238946437835693, + "kl": 3.609375, + "learning_rate": 1.0429354089346976e-07, + "loss": 0.1616, + "num_tokens": 1623247736.0, + "reward": 1.28662109375, + "reward_std": 0.6705954074859619, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4754233956336975, + "rewards/tag_count_reward/mean": 0.85498046875, + "rewards/tag_count_reward/std": 0.24448835849761963, "step": 2814 }, { @@ -81621,27 +81621,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 727.548828125, - "completions/mean_terminated_length": 690.4276733398438, - "completions/min_length": 94.0, - "completions/min_terminated_length": 94.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1046.07421875, + "completions/mean_terminated_length": 968.0294189453125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, "epoch": 0.9609968421951012, - "grad_norm": 1.0462123155593872, - "kl": 6.5234375, - "learning_rate": 1.042167538714731e-07, - "loss": 0.4106, - "num_tokens": 1495124076.0, - "reward": 1.837890625, - "reward_std": 0.4597313106060028, - "rewards/accuracy_reward/mean": 0.02734375, - "rewards/accuracy_reward/std": 0.16324250400066376, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.17131954431533813, + "grad_norm": 3.464841842651367, + "kl": 4.03125, + "learning_rate": 1.0421994882558466e-07, + "loss": 0.1979, + "num_tokens": 1623863294.0, + "reward": 1.193359375, + "reward_std": 0.6017947196960449, + "rewards/accuracy_reward/mean": 0.0234375, + "rewards/accuracy_reward/std": 0.15143637359142303, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.4639657139778137, + "rewards/tag_count_reward/mean": 0.857421875, + "rewards/tag_count_reward/std": 0.24216407537460327, "step": 2815 }, { @@ -81650,27 +81650,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2005.0, - "completions/mean_length": 752.203125, - "completions/mean_terminated_length": 713.0945434570312, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1066.017578125, + "completions/mean_terminated_length": 1021.9285278320312, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.961338226508492, - "grad_norm": 2.0134124755859375, - "kl": 5.66015625, - "learning_rate": 1.0414385013945384e-07, - "loss": 0.3713, - "num_tokens": 1495589060.0, - "reward": 1.89794921875, - "reward_std": 0.4629552364349365, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.89453125, - "rewards/format_reward/std": 0.3074568510055542, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.16919739544391632, + "grad_norm": 4.695958614349365, + "kl": 3.55078125, + "learning_rate": 1.0414698994108689e-07, + "loss": 0.1653, + "num_tokens": 1624488951.0, + "reward": 1.33544921875, + "reward_std": 0.657383382320404, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.380859375, + "rewards/format_reward/std": 0.48607301712036133, + "rewards/tag_count_reward/mean": 0.87060546875, + "rewards/tag_count_reward/std": 0.23869800567626953, "step": 2816 }, { @@ -81679,27 +81679,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 740.59375, - "completions/mean_terminated_length": 722.4713134765625, - "completions/min_length": 65.0, - "completions/min_terminated_length": 65.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1063.79296875, + "completions/mean_terminated_length": 1011.139892578125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.9616796108218827, - "grad_norm": 0.7559698820114136, - "kl": 4.0234375, - "learning_rate": 1.040715792186911e-07, - "loss": 0.2399, - "num_tokens": 1496041124.0, - "reward": 1.958984375, - "reward_std": 0.3937632739543915, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.9296875, - "rewards/format_reward/std": 0.25592297315597534, - "rewards/tag_count_reward/mean": 0.95703125, - "rewards/tag_count_reward/std": 0.15438589453697205, + "grad_norm": 3.1592657566070557, + "kl": 4.0546875, + "learning_rate": 1.0407466434360689e-07, + "loss": 0.1743, + "num_tokens": 1625106493.0, + "reward": 1.3359375, + "reward_std": 0.6510826349258423, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.38671875, + "rewards/format_reward/std": 0.48747459053993225, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.2365511655807495, "step": 2817 }, { @@ -81708,27 +81708,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2042.0, - "completions/mean_length": 772.58984375, - "completions/mean_terminated_length": 723.4360961914062, - "completions/min_length": 67.0, - "completions/min_terminated_length": 67.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1111.603515625, + "completions/mean_terminated_length": 1034.3953857421875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.9620209951352735, - "grad_norm": 1.7330492734909058, - "kl": 7.6171875, - "learning_rate": 1.0399994121176025e-07, - "loss": 0.5064, - "num_tokens": 1496510226.0, - "reward": 1.86181640625, - "reward_std": 0.5165878534317017, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.19931232929229736, + "grad_norm": 1.8703370094299316, + "kl": 3.9296875, + "learning_rate": 1.0400297213587539e-07, + "loss": 0.1829, + "num_tokens": 1625749170.0, + "reward": 1.2568359375, + "reward_std": 0.6568077206611633, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.349609375, + "rewards/format_reward/std": 0.47731292247772217, + "rewards/tag_count_reward/mean": 0.8505859375, + "rewards/tag_count_reward/std": 0.24658063054084778, "step": 2818 }, { @@ -81737,27 +81737,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2002.0, - "completions/mean_length": 731.984375, - "completions/mean_terminated_length": 708.4373168945312, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1029.09375, + "completions/mean_terminated_length": 965.6763916015625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, "epoch": 0.9623623794486643, - "grad_norm": 0.916216254234314, - "kl": 5.51953125, - "learning_rate": 1.039289362203383e-07, - "loss": 0.3396, - "num_tokens": 1496957050.0, - "reward": 1.912109375, - "reward_std": 0.4761887192726135, - "rewards/accuracy_reward/mean": 0.06653226166963577, - "rewards/accuracy_reward/std": 0.2494617998600006, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.94921875, - "rewards/tag_count_reward/std": 0.16361670196056366, + "grad_norm": 5.221717357635498, + "kl": 4.828125, + "learning_rate": 1.0393191341972373e-07, + "loss": 0.2061, + "num_tokens": 1626348114.0, + "reward": 1.24267578125, + "reward_std": 0.6536946296691895, + "rewards/accuracy_reward/mean": 0.07459677755832672, + "rewards/accuracy_reward/std": 0.263004869222641, + "rewards/format_reward/mean": 0.328125, + "rewards/format_reward/std": 0.4699897766113281, + "rewards/tag_count_reward/mean": 0.84228515625, + "rewards/tag_count_reward/std": 0.24611273407936096, "step": 2819 }, { @@ -81766,27 +81766,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 743.6640625, - "completions/mean_terminated_length": 709.683349609375, - "completions/min_length": 98.0, - "completions/min_terminated_length": 98.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1034.189453125, + "completions/mean_terminated_length": 971.0892333984375, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.9627037637620551, - "grad_norm": 1.6667364835739136, - "kl": 6.5703125, - "learning_rate": 1.0385856434520387e-07, - "loss": 0.3979, - "num_tokens": 1497408062.0, - "reward": 1.9189453125, - "reward_std": 0.4979446232318878, - "rewards/accuracy_reward/mean": 0.083984375, - "rewards/accuracy_reward/std": 0.2776356339454651, - "rewards/format_reward/mean": 0.896484375, - "rewards/format_reward/std": 0.30492907762527466, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.18406164646148682, + "grad_norm": 2.284738540649414, + "kl": 4.3984375, + "learning_rate": 1.0386148829608319e-07, + "loss": 0.1933, + "num_tokens": 1626947875.0, + "reward": 1.34326171875, + "reward_std": 0.680327296257019, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.37890625, + "rewards/format_reward/std": 0.4855891764163971, + "rewards/tag_count_reward/mean": 0.85302734375, + "rewards/tag_count_reward/std": 0.24432024359703064, "step": 2820 }, { @@ -81795,27 +81795,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1958.0, - "completions/mean_length": 714.109375, - "completions/mean_terminated_length": 682.0960083007812, - "completions/min_length": 102.0, - "completions/min_terminated_length": 102.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1019.501953125, + "completions/mean_terminated_length": 934.6997680664062, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.963045148075446, - "grad_norm": 2.4002110958099365, - "kl": 5.39453125, - "learning_rate": 1.0378882568623697e-07, - "loss": 0.2916, - "num_tokens": 1497846518.0, - "reward": 1.927734375, - "reward_std": 0.47919708490371704, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.9375, - "rewards/tag_count_reward/std": 0.17625701427459717, + "grad_norm": 2.2609078884124756, + "kl": 4.0078125, + "learning_rate": 1.0379169686498522e-07, + "loss": 0.1632, + "num_tokens": 1627542692.0, + "reward": 1.3291015625, + "reward_std": 0.6488129496574402, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.353515625, + "rewards/format_reward/std": 0.47852855920791626, + "rewards/tag_count_reward/mean": 0.8681640625, + "rewards/tag_count_reward/std": 0.23135384917259216, "step": 2821 }, { @@ -81824,27 +81824,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2012.0, - "completions/mean_length": 740.388671875, - "completions/mean_terminated_length": 709.0060424804688, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1045.08203125, + "completions/mean_terminated_length": 997.9099731445312, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, "epoch": 0.9633865323888368, - "grad_norm": 1.3741546869277954, - "kl": 6.1640625, - "learning_rate": 1.037197203424189e-07, - "loss": 0.4227, - "num_tokens": 1498310253.0, - "reward": 1.91650390625, - "reward_std": 0.4642665982246399, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.90234375, - "rewards/format_reward/std": 0.29713961482048035, - "rewards/tag_count_reward/mean": 0.94970703125, - "rewards/tag_count_reward/std": 0.16188986599445343, + "grad_norm": 2.919736862182617, + "kl": 4.4921875, + "learning_rate": 1.0372253922556121e-07, + "loss": 0.1851, + "num_tokens": 1628162430.0, + "reward": 1.3046875, + "reward_std": 0.6720693111419678, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4754233956336975, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.24468418955802917, "step": 2822 }, { @@ -81853,27 +81853,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05859375, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 841.515625, - "completions/mean_terminated_length": 766.4232788085938, - "completions/min_length": 67.0, - "completions/min_terminated_length": 67.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1126.09765625, + "completions/mean_terminated_length": 1045.84716796875, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, "epoch": 0.9637279167022276, - "grad_norm": 3.3093695640563965, - "kl": 9.9765625, - "learning_rate": 1.0365124841183199e-07, - "loss": 0.6156, - "num_tokens": 1498815157.0, - "reward": 1.765625, - "reward_std": 0.5490684509277344, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.828125, - "rewards/format_reward/std": 0.3776407241821289, - "rewards/tag_count_reward/mean": 0.90625, - "rewards/tag_count_reward/std": 0.2178439050912857, + "grad_norm": 2.1116275787353516, + "kl": 4.6640625, + "learning_rate": 1.0365401547604226e-07, + "loss": 0.1899, + "num_tokens": 1628813040.0, + "reward": 1.28955078125, + "reward_std": 0.6486604809761047, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.380859375, + "rewards/format_reward/std": 0.48607301712036133, + "rewards/tag_count_reward/mean": 0.85791015625, + "rewards/tag_count_reward/std": 0.24017061293125153, "step": 2823 }, { @@ -81882,27 +81882,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.083984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1923.0, - "completions/mean_length": 770.0625, - "completions/mean_terminated_length": 734.1365356445312, - "completions/min_length": 102.0, - "completions/min_terminated_length": 102.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1088.59765625, + "completions/mean_terminated_length": 1000.6354370117188, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.9640693010156184, - "grad_norm": 2.1383633613586426, - "kl": 6.3671875, - "learning_rate": 1.0358340999165966e-07, - "loss": 0.4144, - "num_tokens": 1499294293.0, - "reward": 1.8857421875, - "reward_std": 0.5147566795349121, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.17451083660125732, + "grad_norm": 2.6560699939727783, + "kl": 3.765625, + "learning_rate": 1.0358612571375903e-07, + "loss": 0.1343, + "num_tokens": 1629455266.0, + "reward": 1.29248046875, + "reward_std": 0.6589604616165161, + "rewards/accuracy_reward/mean": 0.068359375, + "rewards/accuracy_reward/std": 0.25260838866233826, + "rewards/format_reward/mean": 0.373046875, + "rewards/format_reward/std": 0.48408737778663635, + "rewards/tag_count_reward/mean": 0.85107421875, + "rewards/tag_count_reward/std": 0.2395850569009781, "step": 2824 }, { @@ -81911,27 +81911,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.083984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 742.251953125, - "completions/mean_terminated_length": 700.1310424804688, - "completions/min_length": 88.0, - "completions/min_terminated_length": 88.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1092.1015625, + "completions/mean_terminated_length": 1004.4605712890625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.9644106853290091, - "grad_norm": 1.4135212898254395, - "kl": 6.84375, - "learning_rate": 1.035162051781861e-07, - "loss": 0.4496, - "num_tokens": 1499755478.0, - "reward": 1.8916015625, - "reward_std": 0.491312712430954, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.17864517867565155, + "grad_norm": 2.799614906311035, + "kl": 4.515625, + "learning_rate": 1.0351887003514184e-07, + "loss": 0.2249, + "num_tokens": 1630095574.0, + "reward": 1.35498046875, + "reward_std": 0.6816108226776123, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.38671875, + "rewards/format_reward/std": 0.48747459053993225, + "rewards/tag_count_reward/mean": 0.84912109375, + "rewards/tag_count_reward/std": 0.25134512782096863, "step": 2825 }, { @@ -81940,27 +81940,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1872.0, - "completions/mean_length": 737.171875, - "completions/mean_terminated_length": 708.3912353515625, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1085.35546875, + "completions/mean_terminated_length": 1012.5504760742188, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, "epoch": 0.9647520696423999, - "grad_norm": 1.6564372777938843, - "kl": 6.53125, - "learning_rate": 1.0344963406679633e-07, - "loss": 0.4448, - "num_tokens": 1500214430.0, - "reward": 1.90380859375, - "reward_std": 0.4616513252258301, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.94677734375, - "rewards/tag_count_reward/std": 0.16544538736343384, + "grad_norm": 6.948239326477051, + "kl": 5.015625, + "learning_rate": 1.0345224853572018e-07, + "loss": 0.2017, + "num_tokens": 1630732796.0, + "reward": 1.28564453125, + "reward_std": 0.6930222511291504, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.380859375, + "rewards/format_reward/std": 0.48607301712036133, + "rewards/tag_count_reward/mean": 0.83056640625, + "rewards/tag_count_reward/std": 0.25790491700172424, "step": 2826 }, { @@ -81969,27 +81969,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1922.0, - "completions/mean_length": 758.19140625, - "completions/mean_terminated_length": 708.4827270507812, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1062.279296875, + "completions/mean_terminated_length": 994.3695678710938, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.9650934539557907, - "grad_norm": 1.8987891674041748, - "kl": 6.40234375, - "learning_rate": 1.0338369675197584e-07, - "loss": 0.4253, - "num_tokens": 1500675296.0, - "reward": 1.83447265625, - "reward_std": 0.5152586102485657, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.1911684274673462, + "grad_norm": 1.7742729187011719, + "kl": 5.0546875, + "learning_rate": 1.0338626131012295e-07, + "loss": 0.2409, + "num_tokens": 1631349355.0, + "reward": 1.28955078125, + "reward_std": 0.6635084748268127, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.373046875, + "rewards/format_reward/std": 0.48408737778663635, + "rewards/tag_count_reward/mean": 0.85205078125, + "rewards/tag_count_reward/std": 0.2467215359210968, "step": 2827 }, { @@ -81998,27 +81998,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1682.0, - "completions/mean_length": 707.9921875, - "completions/mean_terminated_length": 670.3212890625, - "completions/min_length": 106.0, - "completions/min_terminated_length": 106.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1079.55859375, + "completions/mean_terminated_length": 1001.9197998046875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.9654348382691815, - "grad_norm": 1.7056914567947388, - "kl": 6.8984375, - "learning_rate": 1.0331839332731053e-07, - "loss": 0.4827, - "num_tokens": 1501127884.0, - "reward": 1.8330078125, - "reward_std": 0.5483074188232422, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.9208984375, - "rewards/tag_count_reward/std": 0.20593629777431488, + "grad_norm": 2.163330316543579, + "kl": 4.65625, + "learning_rate": 1.0332090845207793e-07, + "loss": 0.2526, + "num_tokens": 1631992185.0, + "reward": 1.265625, + "reward_std": 0.6679297685623169, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.349609375, + "rewards/format_reward/std": 0.47731292247772217, + "rewards/tag_count_reward/mean": 0.833984375, + "rewards/tag_count_reward/std": 0.25609469413757324, "step": 2828 }, { @@ -82027,27 +82027,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 705.625, - "completions/mean_terminated_length": 673.4080200195312, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1021.9296875, + "completions/mean_terminated_length": 953.5250244140625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, "epoch": 0.9657762225825723, - "grad_norm": 2.8771121501922607, - "kl": 6.9296875, - "learning_rate": 1.0325372388548673e-07, - "loss": 0.4234, - "num_tokens": 1501569340.0, - "reward": 1.90087890625, - "reward_std": 0.5529721975326538, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.19808122515678406, + "grad_norm": 4.428511619567871, + "kl": 4.60546875, + "learning_rate": 1.0325619005441191e-07, + "loss": 0.2033, + "num_tokens": 1632595589.0, + "reward": 1.34423828125, + "reward_std": 0.6817231178283691, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.353515625, + "rewards/format_reward/std": 0.47852855920791626, + "rewards/tag_count_reward/mean": 0.85205078125, + "rewards/tag_count_reward/std": 0.24771103262901306, "step": 2829 }, { @@ -82056,27 +82056,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1982.0, - "completions/mean_length": 727.29296875, - "completions/mean_terminated_length": 708.9861450195312, - "completions/min_length": 153.0, - "completions/min_terminated_length": 153.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1016.11328125, + "completions/mean_terminated_length": 951.8880004882812, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.9661176068959632, - "grad_norm": 1.8524610996246338, - "kl": 5.8359375, - "learning_rate": 1.0318968851829084e-07, - "loss": 0.3779, - "num_tokens": 1502022946.0, - "reward": 1.89208984375, - "reward_std": 0.4285368025302887, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.94482421875, - "rewards/tag_count_reward/std": 0.17348045110702515, + "grad_norm": 2.3535547256469727, + "kl": 4.171875, + "learning_rate": 1.0319210620905063e-07, + "loss": 0.1894, + "num_tokens": 1633197071.0, + "reward": 1.32666015625, + "reward_std": 0.6631171107292175, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.37890625, + "rewards/format_reward/std": 0.4855891764163971, + "rewards/tag_count_reward/mean": 0.85986328125, + "rewards/tag_count_reward/std": 0.2483120709657669, "step": 2830 }, { @@ -82085,27 +82085,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 768.84765625, - "completions/mean_terminated_length": 714.1385498046875, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1085.890625, + "completions/mean_terminated_length": 1026.00830078125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, "epoch": 0.966458991209354, - "grad_norm": 2.203659772872925, - "kl": 8.4375, - "learning_rate": 1.031262873166094e-07, - "loss": 0.5622, - "num_tokens": 1502492548.0, - "reward": 1.84619140625, - "reward_std": 0.5414794683456421, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.91650390625, - "rewards/tag_count_reward/std": 0.2056826800107956, + "grad_norm": 1.8867931365966797, + "kl": 3.953125, + "learning_rate": 1.0312865700701835e-07, + "loss": 0.1655, + "num_tokens": 1633828999.0, + "reward": 1.3857421875, + "reward_std": 0.676786482334137, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.41015625, + "rewards/format_reward/std": 0.49234291911125183, + "rewards/tag_count_reward/mean": 0.8583984375, + "rewards/tag_count_reward/std": 0.24122115969657898, "step": 2831 }, { @@ -82114,27 +82114,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 744.240234375, - "completions/mean_terminated_length": 704.8912963867188, - "completions/min_length": 104.0, - "completions/min_terminated_length": 104.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1037.892578125, + "completions/mean_terminated_length": 970.5521240234375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.9668003755227448, - "grad_norm": 1.5507378578186035, - "kl": 6.28515625, - "learning_rate": 1.0306352037042878e-07, - "loss": 0.4382, - "num_tokens": 1502951487.0, - "reward": 1.8828125, - "reward_std": 0.5205909013748169, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.931640625, - "rewards/tag_count_reward/std": 0.19337067008018494, + "grad_norm": 3.9254069328308105, + "kl": 4.59765625, + "learning_rate": 1.0306584253843792e-07, + "loss": 0.2023, + "num_tokens": 1634438288.0, + "reward": 1.34326171875, + "reward_std": 0.7044610381126404, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.38671875, + "rewards/format_reward/std": 0.48747459053993225, + "rewards/tag_count_reward/mean": 0.83544921875, + "rewards/tag_count_reward/std": 0.25632545351982117, "step": 2832 }, { @@ -82143,27 +82143,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2041.0, - "completions/mean_length": 771.90234375, - "completions/mean_terminated_length": 733.3883056640625, - "completions/min_length": 66.0, - "completions/min_terminated_length": 66.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1042.408203125, + "completions/mean_terminated_length": 988.611083984375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, "epoch": 0.9671417598361355, - "grad_norm": 2.5833253860473633, - "kl": 6.265625, - "learning_rate": 1.0300138776883521e-07, - "loss": 0.4523, - "num_tokens": 1503416205.0, - "reward": 1.85107421875, - "reward_std": 0.49749863147735596, - "rewards/accuracy_reward/mean": 0.037109375, - "rewards/accuracy_reward/std": 0.18921469151973724, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.1909683793783188, + "grad_norm": 2.477076530456543, + "kl": 4.23828125, + "learning_rate": 1.030036628925307e-07, + "loss": 0.1836, + "num_tokens": 1635041505.0, + "reward": 1.28271484375, + "reward_std": 0.6709230542182922, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.373046875, + "rewards/format_reward/std": 0.48408737778663635, + "rewards/tag_count_reward/mean": 0.83935546875, + "rewards/tag_count_reward/std": 0.2491648942232132, "step": 2833 }, { @@ -82172,27 +82172,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1977.0, - "completions/mean_length": 780.7109375, - "completions/mean_terminated_length": 745.0842895507812, - "completions/min_length": 31.0, - "completions/min_terminated_length": 31.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1065.828125, + "completions/mean_terminated_length": 1019.6318969726562, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.9674831441495263, - "grad_norm": 1.1714969873428345, - "kl": 7.078125, - "learning_rate": 1.0293988960001453e-07, - "loss": 0.4828, - "num_tokens": 1503891801.0, - "reward": 1.8349609375, - "reward_std": 0.531904399394989, - "rewards/accuracy_reward/mean": 0.041015625, - "rewards/accuracy_reward/std": 0.19852031767368317, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.9248046875, - "rewards/tag_count_reward/std": 0.19958361983299255, + "grad_norm": 4.711578369140625, + "kl": 3.0859375, + "learning_rate": 1.0294211815761628e-07, + "loss": 0.0797, + "num_tokens": 1635663081.0, + "reward": 1.35498046875, + "reward_std": 0.6483269929885864, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.416015625, + "rewards/format_reward/std": 0.493378221988678, + "rewards/tag_count_reward/mean": 0.88232421875, + "rewards/tag_count_reward/std": 0.22051148116588593, "step": 2834 }, { @@ -82201,27 +82201,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 825.32421875, - "completions/mean_terminated_length": 762.5585327148438, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1058.396484375, + "completions/mean_terminated_length": 1016.0713500976562, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, "epoch": 0.9678245284629171, - "grad_norm": 1.9260815382003784, - "kl": 7.890625, - "learning_rate": 1.0287902595125212e-07, - "loss": 0.5501, - "num_tokens": 1504382223.0, - "reward": 1.787109375, - "reward_std": 0.5436063408851624, - "rewards/accuracy_reward/mean": 0.03125, - "rewards/accuracy_reward/std": 0.17416280508041382, - "rewards/format_reward/mean": 0.83984375, - "rewards/format_reward/std": 0.3671095669269562, - "rewards/tag_count_reward/mean": 0.916015625, - "rewards/tag_count_reward/std": 0.2087314873933792, + "grad_norm": 2.1096463203430176, + "kl": 3.28125, + "learning_rate": 1.028812084211124e-07, + "loss": 0.1047, + "num_tokens": 1636272836.0, + "reward": 1.318359375, + "reward_std": 0.6351197957992554, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.38671875, + "rewards/format_reward/std": 0.48747459053993225, + "rewards/tag_count_reward/mean": 0.876953125, + "rewards/tag_count_reward/std": 0.21951310336589813, "step": 2835 }, { @@ -82230,27 +82230,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 784.662109375, - "completions/mean_terminated_length": 735.9736328125, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1119.1875, + "completions/mean_terminated_length": 1079.46240234375, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, "epoch": 0.9681659127763079, - "grad_norm": 2.5262982845306396, - "kl": 6.96875, - "learning_rate": 1.0281879690893287e-07, - "loss": 0.4852, - "num_tokens": 1504863810.0, - "reward": 1.865234375, - "reward_std": 0.5836403369903564, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.923828125, - "rewards/tag_count_reward/std": 0.2034645974636078, + "grad_norm": 3.1537880897521973, + "kl": 3.49609375, + "learning_rate": 1.0282093376953499e-07, + "loss": 0.1405, + "num_tokens": 1636925700.0, + "reward": 1.40771484375, + "reward_std": 0.7227873802185059, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.4453125, + "rewards/format_reward/std": 0.49748632311820984, + "rewards/tag_count_reward/mean": 0.85302734375, + "rewards/tag_count_reward/std": 0.24730566143989563, "step": 2836 }, { @@ -82259,27 +82259,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 814.23046875, - "completions/mean_terminated_length": 779.546142578125, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1109.056640625, + "completions/mean_terminated_length": 1048.5426025390625, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, "epoch": 0.9685072970896987, - "grad_norm": 0.7660391926765442, - "kl": 6.9921875, - "learning_rate": 1.0275920255854082e-07, - "loss": 0.4373, - "num_tokens": 1505357368.0, - "reward": 1.85009765625, - "reward_std": 0.49968987703323364, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.93408203125, - "rewards/tag_count_reward/std": 0.18219561874866486, + "grad_norm": 1.5795732736587524, + "kl": 4.00390625, + "learning_rate": 1.0276129428849773e-07, + "loss": 0.1678, + "num_tokens": 1637570209.0, + "reward": 1.31787109375, + "reward_std": 0.702187180519104, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.3984375, + "rewards/format_reward/std": 0.4900552034378052, + "rewards/tag_count_reward/mean": 0.84716796875, + "rewards/tag_count_reward/std": 0.24472275376319885, "step": 2837 }, { @@ -82288,27 +82288,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.083984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1935.0, - "completions/mean_length": 787.3046875, - "completions/mean_terminated_length": 738.718017578125, - "completions/min_length": 104.0, - "completions/min_terminated_length": 104.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1118.322265625, + "completions/mean_terminated_length": 1033.0853271484375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.9688486814030896, - "grad_norm": 1.3993850946426392, - "kl": 6.453125, - "learning_rate": 1.0270024298465929e-07, - "loss": 0.4072, - "num_tokens": 1505843044.0, - "reward": 1.81982421875, - "reward_std": 0.5220248699188232, - "rewards/accuracy_reward/mean": 0.046875, - "rewards/accuracy_reward/std": 0.21157780289649963, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.19834154844284058, + "grad_norm": 4.263636589050293, + "kl": 4.15625, + "learning_rate": 1.0270229006271222e-07, + "loss": 0.2146, + "num_tokens": 1638225366.0, + "reward": 1.33203125, + "reward_std": 0.6862621307373047, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.421875, + "rewards/format_reward/std": 0.49434176087379456, + "rewards/tag_count_reward/mean": 0.833984375, + "rewards/tag_count_reward/std": 0.25417712330818176, "step": 2838 }, { @@ -82317,27 +82317,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 802.654296875, - "completions/mean_terminated_length": 744.0797119140625, - "completions/min_length": 96.0, - "completions/min_terminated_length": 96.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1137.435546875, + "completions/mean_terminated_length": 1058.1719970703125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, "epoch": 0.9691900657164804, - "grad_norm": 1.2276525497436523, - "kl": 8.109375, - "learning_rate": 1.0264191827097057e-07, - "loss": 0.527, - "num_tokens": 1506330339.0, - "reward": 1.8525390625, - "reward_std": 0.5859047174453735, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.21028900146484375, + "grad_norm": 2.7013702392578125, + "kl": 3.9921875, + "learning_rate": 1.0264392117598772e-07, + "loss": 0.1759, + "num_tokens": 1638884069.0, + "reward": 1.38916015625, + "reward_std": 0.6746758222579956, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.435546875, + "rewards/format_reward/std": 0.49631330370903015, + "rewards/tag_count_reward/mean": 0.85791015625, + "rewards/tag_count_reward/std": 0.24421072006225586, "step": 2839 }, { @@ -82346,27 +82346,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 789.37109375, - "completions/mean_terminated_length": 738.207275390625, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1086.07421875, + "completions/mean_terminated_length": 1021.9458618164062, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.9695314500298712, - "grad_norm": 1.2961440086364746, - "kl": 7.0703125, - "learning_rate": 1.0258422850025601e-07, - "loss": 0.4479, - "num_tokens": 1506817233.0, - "reward": 1.8408203125, - "reward_std": 0.48050573468208313, - "rewards/accuracy_reward/mean": 0.0390625, - "rewards/accuracy_reward/std": 0.1939331740140915, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.1879453957080841, + "grad_norm": 3.3814971446990967, + "kl": 3.69921875, + "learning_rate": 1.02586187711231e-07, + "loss": 0.1626, + "num_tokens": 1639522875.0, + "reward": 1.31689453125, + "reward_std": 0.6733240485191345, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.41015625, + "rewards/format_reward/std": 0.49234291911125183, + "rewards/tag_count_reward/mean": 0.85595703125, + "rewards/tag_count_reward/std": 0.24154724180698395, "step": 2840 }, { @@ -82375,27 +82375,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1958.0, - "completions/mean_length": 739.642578125, - "completions/mean_terminated_length": 702.8613891601562, - "completions/min_length": 96.0, - "completions/min_terminated_length": 96.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1067.599609375, + "completions/mean_terminated_length": 970.8218994140625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.9698728343432619, - "grad_norm": 1.3730722665786743, - "kl": 6.4375, - "learning_rate": 1.025271737543956e-07, - "loss": 0.426, - "num_tokens": 1507276890.0, - "reward": 1.87060546875, - "reward_std": 0.4937661290168762, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.93701171875, - "rewards/tag_count_reward/std": 0.1812174916267395, + "grad_norm": 3.728794813156128, + "kl": 4.0, + "learning_rate": 1.0252908975044645e-07, + "loss": 0.1748, + "num_tokens": 1640150446.0, + "reward": 1.3662109375, + "reward_std": 0.6644833087921143, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.404296875, + "rewards/format_reward/std": 0.4912354052066803, + "rewards/tag_count_reward/mean": 0.8623046875, + "rewards/tag_count_reward/std": 0.23478132486343384, "step": 2841 }, { @@ -82404,27 +82404,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 763.080078125, - "completions/mean_terminated_length": 729.605224609375, - "completions/min_length": 64.0, - "completions/min_terminated_length": 64.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1089.369140625, + "completions/mean_terminated_length": 1033.9111328125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.9702142186566527, - "grad_norm": 1.189055323600769, - "kl": 6.7109375, - "learning_rate": 1.0247075411436815e-07, - "loss": 0.4136, - "num_tokens": 1507753651.0, - "reward": 1.82861328125, - "reward_std": 0.491859495639801, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.18932512402534485, + "grad_norm": 1.847825288772583, + "kl": 3.90234375, + "learning_rate": 1.0247262737473563e-07, + "loss": 0.1608, + "num_tokens": 1640794267.0, + "reward": 1.33154296875, + "reward_std": 0.6865270137786865, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.412109375, + "rewards/format_reward/std": 0.49269601702690125, + "rewards/tag_count_reward/mean": 0.86083984375, + "rewards/tag_count_reward/std": 0.23728874325752258, "step": 2842 }, { @@ -82433,27 +82433,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1948.0, - "completions/mean_length": 765.400390625, - "completions/mean_terminated_length": 726.6901245117188, - "completions/min_length": 93.0, - "completions/min_terminated_length": 93.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1052.69921875, + "completions/mean_terminated_length": 1003.7499389648438, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.9705556029700435, - "grad_norm": 1.5975303649902344, - "kl": 6.0390625, - "learning_rate": 1.0241496966025103e-07, - "loss": 0.3615, - "num_tokens": 1508221024.0, - "reward": 1.91015625, - "reward_std": 0.5483856797218323, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.19278669357299805, + "grad_norm": 2.42067289352417, + "kl": 4.890625, + "learning_rate": 1.0241680066429735e-07, + "loss": 0.2124, + "num_tokens": 1641408737.0, + "reward": 1.435546875, + "reward_std": 0.6897906064987183, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.44921875, + "rewards/format_reward/std": 0.497901052236557, + "rewards/tag_count_reward/mean": 0.873046875, + "rewards/tag_count_reward/std": 0.23038743436336517, "step": 2843 }, { @@ -82462,27 +82462,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1983.0, - "completions/mean_length": 786.294921875, - "completions/mean_terminated_length": 740.3218994140625, - "completions/min_length": 98.0, - "completions/min_terminated_length": 98.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1121.724609375, + "completions/mean_terminated_length": 1062.0269775390625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, "epoch": 0.9708969872834343, - "grad_norm": 1.0943127870559692, - "kl": 6.7578125, - "learning_rate": 1.0235982047121997e-07, - "loss": 0.4222, - "num_tokens": 1508703735.0, - "reward": 1.83642578125, - "reward_std": 0.5712246298789978, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.845703125, - "rewards/format_reward/std": 0.36158639192581177, - "rewards/tag_count_reward/mean": 0.91845703125, - "rewards/tag_count_reward/std": 0.20408298075199127, + "grad_norm": 4.917789936065674, + "kl": 4.2890625, + "learning_rate": 1.0236160969842753e-07, + "loss": 0.152, + "num_tokens": 1642063188.0, + "reward": 1.3212890625, + "reward_std": 0.7013719081878662, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.396484375, + "rewards/format_reward/std": 0.4896455705165863, + "rewards/tag_count_reward/mean": 0.8349609375, + "rewards/tag_count_reward/std": 0.25094419717788696, "step": 2844 }, { @@ -82491,27 +82491,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 739.03125, - "completions/mean_terminated_length": 718.2540283203125, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1015.5859375, + "completions/mean_terminated_length": 962.5872802734375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.9712383715968251, - "grad_norm": 2.157069444656372, - "kl": 4.515625, - "learning_rate": 1.0230530662554937e-07, - "loss": 0.299, - "num_tokens": 1509163847.0, - "reward": 1.931640625, - "reward_std": 0.45786046981811523, - "rewards/accuracy_reward/mean": 0.078125, - "rewards/accuracy_reward/std": 0.26863065361976624, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.94921875, - "rewards/tag_count_reward/std": 0.16876834630966187, + "grad_norm": 2.033177375793457, + "kl": 3.83203125, + "learning_rate": 1.0230705455551917e-07, + "loss": 0.1485, + "num_tokens": 1642664896.0, + "reward": 1.3798828125, + "reward_std": 0.6810930967330933, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.4296875, + "rewards/format_reward/std": 0.4955156147480011, + "rewards/tag_count_reward/mean": 0.8603515625, + "rewards/tag_count_reward/std": 0.23570743203163147, "step": 2845 }, { @@ -82520,27 +82520,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1953.0, - "completions/mean_length": 725.2421875, - "completions/mean_terminated_length": 701.5745239257812, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 1061.58203125, + "completions/mean_terminated_length": 968.8419189453125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, "epoch": 0.971579755910216, - "grad_norm": 1.0849651098251343, - "kl": 5.2578125, - "learning_rate": 1.0225142820061143e-07, - "loss": 0.3072, - "num_tokens": 1509613459.0, - "reward": 1.875, - "reward_std": 0.43959301710128784, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.9453125, - "rewards/tag_count_reward/std": 0.16754092276096344, + "grad_norm": 1.648969292640686, + "kl": 5.1640625, + "learning_rate": 1.0225313531306198e-07, + "loss": 0.2459, + "num_tokens": 1643286714.0, + "reward": 1.359375, + "reward_std": 0.6877593994140625, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.44140625, + "rewards/format_reward/std": 0.4970405399799347, + "rewards/tag_count_reward/mean": 0.853515625, + "rewards/tag_count_reward/std": 0.24083462357521057, "step": 2846 }, { @@ -82549,27 +82549,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 759.966796875, - "completions/mean_terminated_length": 710.3265380859375, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1001.361328125, + "completions/mean_terminated_length": 929.2546997070312, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.9719211402236068, - "grad_norm": 0.8330246806144714, - "kl": 7.390625, - "learning_rate": 1.0219818527287691e-07, - "loss": 0.4548, - "num_tokens": 1510080834.0, - "reward": 1.87548828125, - "reward_std": 0.5958338379859924, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.1997050791978836, + "grad_norm": 6.081742286682129, + "kl": 4.93359375, + "learning_rate": 1.0219985204764262e-07, + "loss": 0.2209, + "num_tokens": 1643877683.0, + "reward": 1.35546875, + "reward_std": 0.7026700973510742, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.37890625, + "rewards/format_reward/std": 0.4855891764163971, + "rewards/tag_count_reward/mean": 0.845703125, + "rewards/tag_count_reward/std": 0.25048112869262695, "step": 2847 }, { @@ -82578,27 +82578,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1832.0, - "completions/mean_length": 773.40234375, - "completions/mean_terminated_length": 732.2862548828125, - "completions/min_length": 65.0, - "completions/min_terminated_length": 65.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1057.79296875, + "completions/mean_terminated_length": 991.7792358398438, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.9722625245369976, - "grad_norm": 1.3064340353012085, - "kl": 7.0234375, - "learning_rate": 1.0214557791791425e-07, - "loss": 0.4567, - "num_tokens": 1510559184.0, - "reward": 1.90576171875, - "reward_std": 0.5227937698364258, - "rewards/accuracy_reward/mean": 0.1015625, - "rewards/accuracy_reward/std": 0.30236753821372986, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.93505859375, - "rewards/tag_count_reward/std": 0.18187542259693146, + "grad_norm": 1.79745614528656, + "kl": 4.734375, + "learning_rate": 1.0214720483494425e-07, + "loss": 0.2126, + "num_tokens": 1644501641.0, + "reward": 1.4091796875, + "reward_std": 0.7045381665229797, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.458984375, + "rewards/format_reward/std": 0.49880221486091614, + "rewards/tag_count_reward/mean": 0.8447265625, + "rewards/tag_count_reward/std": 0.24889487028121948, "step": 2848 }, { @@ -82607,27 +82607,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1946.0, - "completions/mean_length": 811.189453125, - "completions/mean_terminated_length": 778.9679565429688, - "completions/min_length": 120.0, - "completions/min_terminated_length": 120.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1130.638671875, + "completions/mean_terminated_length": 1063.3270263671875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, "epoch": 0.9726039088503883, - "grad_norm": 1.6328932046890259, - "kl": 6.5234375, - "learning_rate": 1.0209360621039007e-07, - "loss": 0.4096, - "num_tokens": 1511060273.0, - "reward": 1.81884765625, - "reward_std": 0.533811092376709, - "rewards/accuracy_reward/mean": 0.04435483738780022, - "rewards/accuracy_reward/std": 0.2060900777578354, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.92822265625, - "rewards/tag_count_reward/std": 0.18597961962223053, + "grad_norm": 2.316469669342041, + "kl": 4.7578125, + "learning_rate": 1.0209519374974673e-07, + "loss": 0.2448, + "num_tokens": 1645166288.0, + "reward": 1.3291015625, + "reward_std": 0.7262699604034424, + "rewards/accuracy_reward/mean": 0.07459677755832672, + "rewards/accuracy_reward/std": 0.263004869222641, + "rewards/format_reward/mean": 0.4296875, + "rewards/format_reward/std": 0.4955156147480011, + "rewards/tag_count_reward/mean": 0.8271484375, + "rewards/tag_count_reward/std": 0.2666294574737549, "step": 2849 }, { @@ -82636,27 +82636,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1974.0, - "completions/mean_length": 797.998046875, - "completions/mean_terminated_length": 765.432861328125, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1098.19140625, + "completions/mean_terminated_length": 1032.7557373046875, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, "epoch": 0.9729452931637791, - "grad_norm": 2.272385358810425, - "kl": 6.9765625, - "learning_rate": 1.0204227022406866e-07, - "loss": 0.4145, - "num_tokens": 1511546640.0, - "reward": 1.92626953125, - "reward_std": 0.5484147071838379, - "rewards/accuracy_reward/mean": 0.126953125, - "rewards/accuracy_reward/std": 0.33324605226516724, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.93212890625, - "rewards/tag_count_reward/std": 0.18874379992485046, + "grad_norm": 4.359287261962891, + "kl": 5.09375, + "learning_rate": 1.0204381886592631e-07, + "loss": 0.2407, + "num_tokens": 1645806354.0, + "reward": 1.318359375, + "reward_std": 0.7137932777404785, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.39453125, + "rewards/format_reward/std": 0.4892277717590332, + "rewards/tag_count_reward/mean": 0.826171875, + "rewards/tag_count_reward/std": 0.26553234457969666, "step": 2850 }, { @@ -82665,27 +82665,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 776.94140625, - "completions/mean_terminated_length": 733.2889404296875, - "completions/min_length": 91.0, - "completions/min_terminated_length": 91.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1104.623046875, + "completions/mean_terminated_length": 1039.6304931640625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.9732866774771699, - "grad_norm": 1.8062188625335693, - "kl": 6.7109375, - "learning_rate": 1.019915700318121e-07, - "loss": 0.3848, - "num_tokens": 1512033570.0, - "reward": 1.8427734375, - "reward_std": 0.5338236093521118, + "grad_norm": 1.8686693906784058, + "kl": 4.0078125, + "learning_rate": 1.0199308025645555e-07, + "loss": 0.1525, + "num_tokens": 1646461057.0, + "reward": 1.38720703125, + "reward_std": 0.6902011632919312, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.9169921875, - "rewards/tag_count_reward/std": 0.21086981892585754, + "rewards/format_reward/mean": 0.455078125, + "rewards/format_reward/std": 0.4984649419784546, + "rewards/tag_count_reward/mean": 0.85009765625, + "rewards/tag_count_reward/std": 0.24801558256149292, "step": 2851 }, { @@ -82694,27 +82694,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1883.0, - "completions/mean_length": 815.73046875, - "completions/mean_terminated_length": 760.404052734375, - "completions/min_length": 79.0, - "completions/min_terminated_length": 79.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1100.45703125, + "completions/mean_terminated_length": 1028.794189453125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.9736280617905607, - "grad_norm": 3.596667528152466, - "kl": 9.1953125, - "learning_rate": 1.0194150570558e-07, - "loss": 0.5622, - "num_tokens": 1512537224.0, - "reward": 1.80029296875, - "reward_std": 0.6002273559570312, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.818359375, - "rewards/format_reward/std": 0.38592514395713806, - "rewards/tag_count_reward/mean": 0.90771484375, - "rewards/tag_count_reward/std": 0.21251004934310913, + "grad_norm": 1.9374486207962036, + "kl": 4.16796875, + "learning_rate": 1.0194297799340319e-07, + "loss": 0.1853, + "num_tokens": 1647110491.0, + "reward": 1.349609375, + "reward_std": 0.6790359616279602, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.41015625, + "rewards/format_reward/std": 0.49234291911125183, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.2461792230606079, "step": 2852 }, { @@ -82723,27 +82723,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.060546875, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1945.0, - "completions/mean_length": 808.33203125, - "completions/mean_terminated_length": 728.4365844726562, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1069.001953125, + "completions/mean_terminated_length": 1001.5553588867188, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, "epoch": 0.9739694461039515, - "grad_norm": 3.8079118728637695, - "kl": 11.296875, - "learning_rate": 1.0189207731642956e-07, - "loss": 0.6901, - "num_tokens": 1513032450.0, - "reward": 1.8154296875, - "reward_std": 0.6169764995574951, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.814453125, - "rewards/format_reward/std": 0.38912075757980347, - "rewards/tag_count_reward/mean": 0.9052734375, - "rewards/tag_count_reward/std": 0.21965888142585754, + "grad_norm": 4.581700801849365, + "kl": 4.55078125, + "learning_rate": 1.0189351214793437e-07, + "loss": 0.2019, + "num_tokens": 1647739180.0, + "reward": 1.36181640625, + "reward_std": 0.660216212272644, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.416015625, + "rewards/format_reward/std": 0.493378221988678, + "rewards/tag_count_reward/mean": 0.84619140625, + "rewards/tag_count_reward/std": 0.24310480058193207, "step": 2853 }, { @@ -82752,27 +82752,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1882.0, - "completions/mean_length": 798.49609375, - "completions/mean_terminated_length": 750.3407592773438, - "completions/min_length": 85.0, - "completions/min_terminated_length": 85.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1096.3515625, + "completions/mean_terminated_length": 1013.51171875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, "epoch": 0.9743108304173423, - "grad_norm": 1.598483681678772, - "kl": 7.2265625, - "learning_rate": 1.0184328493451527e-07, - "loss": 0.4716, - "num_tokens": 1513523904.0, - "reward": 1.81005859375, - "reward_std": 0.4909866154193878, - "rewards/accuracy_reward/mean": 0.02734375, - "rewards/accuracy_reward/std": 0.16324250400066376, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.19460642337799072, + "grad_norm": 2.6207141876220703, + "kl": 4.6484375, + "learning_rate": 1.0184468279030992e-07, + "loss": 0.2122, + "num_tokens": 1648383136.0, + "reward": 1.2822265625, + "reward_std": 0.6821188926696777, + "rewards/accuracy_reward/mean": 0.029296875, + "rewards/accuracy_reward/std": 0.16880230605602264, + "rewards/format_reward/mean": 0.412109375, + "rewards/format_reward/std": 0.49269601702690125, + "rewards/tag_count_reward/mean": 0.8408203125, + "rewards/tag_count_reward/std": 0.25229611992836, "step": 2854 }, { @@ -82781,27 +82781,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01171875, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 740.95703125, - "completions/mean_terminated_length": 725.4585571289062, - "completions/min_length": 117.0, - "completions/min_terminated_length": 117.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1013.330078125, + "completions/mean_terminated_length": 962.4446411132812, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.9746522147307332, - "grad_norm": 2.422755002975464, - "kl": 5.1875, - "learning_rate": 1.0179512862908905e-07, - "loss": 0.3305, - "num_tokens": 1513980282.0, - "reward": 1.892578125, - "reward_std": 0.48154059052467346, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.16541723906993866, + "grad_norm": 1.7399166822433472, + "kl": 4.52734375, + "learning_rate": 1.0179648998988694e-07, + "loss": 0.2159, + "num_tokens": 1648978969.0, + "reward": 1.36865234375, + "reward_std": 0.712919294834137, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.423828125, + "rewards/format_reward/std": 0.4946470856666565, + "rewards/tag_count_reward/mean": 0.84716796875, + "rewards/tag_count_reward/std": 0.24472275376319885, "step": 2855 }, { @@ -82810,27 +82810,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 754.169921875, - "completions/mean_terminated_length": 684.95263671875, - "completions/min_length": 58.0, - "completions/min_terminated_length": 58.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1048.41015625, + "completions/mean_terminated_length": 970.5473022460938, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, "epoch": 0.974993599044124, - "grad_norm": 1.3403568267822266, - "kl": 6.9609375, - "learning_rate": 1.0174760846849994e-07, - "loss": 0.4621, - "num_tokens": 1514442305.0, - "reward": 1.89453125, - "reward_std": 0.5234760046005249, - "rewards/accuracy_reward/mean": 0.111328125, - "rewards/accuracy_reward/std": 0.31484565138816833, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.919921875, - "rewards/tag_count_reward/std": 0.21026401221752167, + "grad_norm": 2.0213963985443115, + "kl": 5.03125, + "learning_rate": 1.0174893381511803e-07, + "loss": 0.2574, + "num_tokens": 1649591643.0, + "reward": 1.34033203125, + "reward_std": 0.7047779560089111, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.4140625, + "rewards/format_reward/std": 0.49304109811782837, + "rewards/tag_count_reward/mean": 0.83251953125, + "rewards/tag_count_reward/std": 0.25441715121269226, "step": 2856 }, { @@ -82839,27 +82839,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.087890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1935.0, - "completions/mean_length": 711.4296875, - "completions/mean_terminated_length": 673.8554077148438, - "completions/min_length": 95.0, - "completions/min_terminated_length": 95.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1066.361328125, + "completions/mean_terminated_length": 971.7708740234375, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, "epoch": 0.9753349833575147, - "grad_norm": 1.6096361875534058, - "kl": 5.5625, - "learning_rate": 1.0170072452019414e-07, - "loss": 0.3562, - "num_tokens": 1514891245.0, - "reward": 1.931640625, - "reward_std": 0.5143195986747742, - "rewards/accuracy_reward/mean": 0.119140625, - "rewards/accuracy_reward/std": 0.32427072525024414, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.9375, - "rewards/tag_count_reward/std": 0.18373169004917145, + "grad_norm": 1.7970101833343506, + "kl": 4.8515625, + "learning_rate": 1.0170201433355185e-07, + "loss": 0.2311, + "num_tokens": 1650222308.0, + "reward": 1.3046875, + "reward_std": 0.7138506770133972, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.37890625, + "rewards/format_reward/std": 0.4855891764163971, + "rewards/tag_count_reward/mean": 0.826171875, + "rewards/tag_count_reward/std": 0.26135390996932983, "step": 2857 }, { @@ -82868,27 +82868,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 763.240234375, - "completions/mean_terminated_length": 702.8118286132812, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1065.8515625, + "completions/mean_terminated_length": 987.1138916015625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.9756763676709055, - "grad_norm": 1.431836724281311, - "kl": 7.6875, - "learning_rate": 1.0165447685071481e-07, - "loss": 0.4733, - "num_tokens": 1515357976.0, - "reward": 1.77490234375, - "reward_std": 0.5884698033332825, - "rewards/accuracy_reward/mean": 0.056640625, - "rewards/accuracy_reward/std": 0.23138070106506348, - "rewards/format_reward/mean": 0.8125, - "rewards/format_reward/std": 0.39069411158561707, - "rewards/tag_count_reward/mean": 0.90576171875, - "rewards/tag_count_reward/std": 0.21791352331638336, + "grad_norm": 2.336575508117676, + "kl": 4.5390625, + "learning_rate": 1.0165573161183246e-07, + "loss": 0.2254, + "num_tokens": 1650843976.0, + "reward": 1.34765625, + "reward_std": 0.6703216433525085, + "rewards/accuracy_reward/mean": 0.080078125, + "rewards/accuracy_reward/std": 0.271679550409317, + "rewards/format_reward/mean": 0.4296875, + "rewards/format_reward/std": 0.4955156147480011, + "rewards/tag_count_reward/mean": 0.837890625, + "rewards/tag_count_reward/std": 0.25669097900390625, "step": 2858 }, { @@ -82897,27 +82897,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1908.0, - "completions/mean_length": 768.263671875, - "completions/mean_terminated_length": 713.529541015625, - "completions/min_length": 92.0, - "completions/min_terminated_length": 92.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1100.978515625, + "completions/mean_terminated_length": 1037.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, "epoch": 0.9760177519842963, - "grad_norm": 1.0789576768875122, - "kl": 6.25390625, - "learning_rate": 1.0160886552570211e-07, - "loss": 0.4158, - "num_tokens": 1515835359.0, - "reward": 1.84130859375, - "reward_std": 0.5203293561935425, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.18583056330680847, + "grad_norm": 3.625345230102539, + "kl": 4.2421875, + "learning_rate": 1.016100857156996e-07, + "loss": 0.1964, + "num_tokens": 1651491709.0, + "reward": 1.33203125, + "reward_std": 0.7000211477279663, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.4296875, + "rewards/format_reward/std": 0.4955156147480011, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.24377650022506714, "step": 2859 }, { @@ -82926,27 +82926,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 752.947265625, - "completions/mean_terminated_length": 734.9960327148438, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1104.259765625, + "completions/mean_terminated_length": 1037.1317138671875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.9763591362976871, - "grad_norm": 1.5350780487060547, - "kl": 5.265625, - "learning_rate": 1.0156389060989289e-07, - "loss": 0.316, - "num_tokens": 1516295796.0, - "reward": 1.85546875, - "reward_std": 0.47244903445243835, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.17968250811100006, + "grad_norm": 4.727302551269531, + "kl": 3.3359375, + "learning_rate": 1.0156507670998842e-07, + "loss": 0.134, + "num_tokens": 1652132018.0, + "reward": 1.44091796875, + "reward_std": 0.6813417673110962, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.50390625, + "rewards/format_reward/std": 0.5004737377166748, + "rewards/tag_count_reward/mean": 0.87255859375, + "rewards/tag_count_reward/std": 0.22555410861968994, "step": 2860 }, { @@ -82955,27 +82955,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1954.0, - "completions/mean_length": 716.34375, - "completions/mean_terminated_length": 684.384033203125, - "completions/min_length": 106.0, - "completions/min_terminated_length": 106.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1024.431640625, + "completions/mean_terminated_length": 953.9144287109375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, "epoch": 0.9767005206110779, - "grad_norm": 2.4552807807922363, - "kl": 5.6875, - "learning_rate": 1.0151955216712089e-07, - "loss": 0.387, - "num_tokens": 1516737892.0, - "reward": 1.90966796875, - "reward_std": 0.5338730812072754, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.93505859375, - "rewards/tag_count_reward/std": 0.18187542259693146, + "grad_norm": 3.09757924079895, + "kl": 4.69140625, + "learning_rate": 1.0152070465862951e-07, + "loss": 0.2032, + "num_tokens": 1652731855.0, + "reward": 1.3427734375, + "reward_std": 0.6981798410415649, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.431640625, + "rewards/format_reward/std": 0.4957893490791321, + "rewards/tag_count_reward/mean": 0.8408203125, + "rewards/tag_count_reward/std": 0.24690592288970947, "step": 2861 }, { @@ -82984,27 +82984,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 753.84375, - "completions/mean_terminated_length": 725.4291381835938, - "completions/min_length": 86.0, - "completions/min_terminated_length": 86.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1107.91796875, + "completions/mean_terminated_length": 1026.0849609375, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.9770419049244687, - "grad_norm": 1.608132004737854, - "kl": 4.703125, - "learning_rate": 1.0147585026031632e-07, - "loss": 0.3193, - "num_tokens": 1517204996.0, - "reward": 1.900390625, - "reward_std": 0.4727388918399811, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.94140625, - "rewards/tag_count_reward/std": 0.16840559244155884, + "grad_norm": 2.9012365341186523, + "kl": 4.4140625, + "learning_rate": 1.0147696962464861e-07, + "loss": 0.2215, + "num_tokens": 1653380245.0, + "reward": 1.33837890625, + "reward_std": 0.7225753664970398, + "rewards/accuracy_reward/mean": 0.064453125, + "rewards/accuracy_reward/std": 0.24579854309558868, + "rewards/format_reward/mean": 0.447265625, + "rewards/format_reward/std": 0.4976975917816162, + "rewards/tag_count_reward/mean": 0.82666015625, + "rewards/tag_count_reward/std": 0.2595668137073517, "step": 2862 }, { @@ -83013,27 +83013,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1962.0, - "completions/mean_length": 729.15234375, - "completions/mean_terminated_length": 697.5000610351562, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1079.482421875, + "completions/mean_terminated_length": 1029.763916015625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.9773832892378596, - "grad_norm": 2.0549094676971436, - "kl": 4.421875, - "learning_rate": 1.0143278495150619e-07, - "loss": 0.3088, - "num_tokens": 1517655778.0, - "reward": 1.923828125, - "reward_std": 0.45604830980300903, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.8984375, - "rewards/format_reward/std": 0.30236753821372986, - "rewards/tag_count_reward/mean": 0.94921875, - "rewards/tag_count_reward/std": 0.16361670196056366, + "grad_norm": 2.338446617126465, + "kl": 4.78125, + "learning_rate": 1.0143387167016674e-07, + "loss": 0.2336, + "num_tokens": 1654010396.0, + "reward": 1.36328125, + "reward_std": 0.6864089369773865, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.421875, + "rewards/format_reward/std": 0.49434176087379456, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.24972465634346008, "step": 2863 }, { @@ -83042,27 +83042,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1929.0, - "completions/mean_length": 786.349609375, - "completions/mean_terminated_length": 756.0700073242188, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1116.798828125, + "completions/mean_terminated_length": 1050.562744140625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, "epoch": 0.9777246735512504, - "grad_norm": 0.9248939752578735, - "kl": 5.64453125, - "learning_rate": 1.0139035630181373e-07, - "loss": 0.3801, - "num_tokens": 1518136645.0, - "reward": 1.91796875, - "reward_std": 0.5349627733230591, - "rewards/accuracy_reward/mean": 0.115234375, - "rewards/accuracy_reward/std": 0.3196168541908264, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.17965060472488403, + "grad_norm": 2.406902551651001, + "kl": 4.09765625, + "learning_rate": 1.0139141085639992e-07, + "loss": 0.1757, + "num_tokens": 1654660453.0, + "reward": 1.4404296875, + "reward_std": 0.7518150806427002, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.501953125, + "rewards/format_reward/std": 0.5004851818084717, + "rewards/tag_count_reward/mean": 0.8310546875, + "rewards/tag_count_reward/std": 0.2603483498096466, "step": 2864 }, { @@ -83071,27 +83071,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.087890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 752.0703125, - "completions/mean_terminated_length": 718.3086547851562, - "completions/min_length": 89.0, - "completions/min_terminated_length": 89.0, - "epoch": 0.9780660578646411, - "grad_norm": 1.2031464576721191, - "kl": 4.890625, - "learning_rate": 1.0134856437145871e-07, - "loss": 0.3152, - "num_tokens": 1518591289.0, - "reward": 1.90087890625, - "reward_std": 0.5522708892822266, - "rewards/accuracy_reward/mean": 0.109375, - "rewards/accuracy_reward/std": 0.31241437792778015, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.1918918341398239, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1085.56640625, + "completions/mean_terminated_length": 992.8265380859375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.9780660578646411, + "grad_norm": 1.7805724143981934, + "kl": 4.25, + "learning_rate": 1.0134958724365934e-07, + "loss": 0.2031, + "num_tokens": 1655285847.0, + "reward": 1.38623046875, + "reward_std": 0.7169825434684753, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.451171875, + "rewards/format_reward/std": 0.498096764087677, + "rewards/tag_count_reward/mean": 0.83349609375, + "rewards/tag_count_reward/std": 0.2536158859729767, "step": 2865 }, { @@ -83100,27 +83100,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1955.0, - "completions/mean_length": 767.494140625, - "completions/mean_terminated_length": 726.1875, - "completions/min_length": 27.0, - "completions/min_terminated_length": 27.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1144.73046875, + "completions/mean_terminated_length": 1036.0218505859375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, "epoch": 0.9784074421780319, - "grad_norm": 1.9690607786178589, - "kl": 4.76953125, - "learning_rate": 1.0130740921975706e-07, - "loss": 0.3667, - "num_tokens": 1519071318.0, - "reward": 1.9267578125, - "reward_std": 0.48103824257850647, - "rewards/accuracy_reward/mean": 0.10685484111309052, - "rewards/accuracy_reward/std": 0.30924052000045776, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.9423828125, - "rewards/tag_count_reward/std": 0.17374257743358612, + "grad_norm": 2.1228413581848145, + "kl": 5.125, + "learning_rate": 1.01308400891351e-07, + "loss": 0.2461, + "num_tokens": 1655959021.0, + "reward": 1.33056640625, + "reward_std": 0.7182646989822388, + "rewards/accuracy_reward/mean": 0.08064515888690948, + "rewards/accuracy_reward/std": 0.2725643217563629, + "rewards/format_reward/mean": 0.427734375, + "rewards/format_reward/std": 0.4952339828014374, + "rewards/tag_count_reward/mean": 0.82470703125, + "rewards/tag_count_reward/std": 0.25966617465019226, "step": 2866 }, { @@ -83129,27 +83129,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2026.0, - "completions/mean_length": 843.419921875, - "completions/mean_terminated_length": 799.5283813476562, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1160.169921875, + "completions/mean_terminated_length": 1106.8634033203125, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, "epoch": 0.9787488264914227, - "grad_norm": 1.9435549974441528, - "kl": 7.453125, - "learning_rate": 1.0126689090512102e-07, - "loss": 0.4961, - "num_tokens": 1519578557.0, - "reward": 1.857421875, - "reward_std": 0.550590455532074, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.19933690130710602, + "grad_norm": 3.094758987426758, + "kl": 3.78125, + "learning_rate": 1.0126785185797567e-07, + "loss": 0.1167, + "num_tokens": 1656628436.0, + "reward": 1.45458984375, + "reward_std": 0.7193611860275269, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.505859375, + "rewards/format_reward/std": 0.5004546642303467, + "rewards/tag_count_reward/mean": 0.85498046875, + "rewards/tag_count_reward/std": 0.2384096086025238, "step": 2867 }, { @@ -83158,27 +83158,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1998.0, - "completions/mean_length": 740.43359375, - "completions/mean_terminated_length": 709.0520629882812, - "completions/min_length": 106.0, - "completions/min_terminated_length": 106.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1095.970703125, + "completions/mean_terminated_length": 1028.2530517578125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, "epoch": 0.9790902108048135, - "grad_norm": 1.3769993782043457, - "kl": 5.890625, - "learning_rate": 1.0122700948505894e-07, - "loss": 0.3869, - "num_tokens": 1520039627.0, - "reward": 1.93310546875, - "reward_std": 0.5323408842086792, - "rewards/accuracy_reward/mean": 0.11088709533214569, - "rewards/accuracy_reward/std": 0.3143092691898346, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.94091796875, - "rewards/tag_count_reward/std": 0.17570249736309052, + "grad_norm": 2.7481472492218018, + "kl": 4.37890625, + "learning_rate": 1.01227940201129e-07, + "loss": 0.2079, + "num_tokens": 1657271541.0, + "reward": 1.3837890625, + "reward_std": 0.7567757964134216, + "rewards/accuracy_reward/mean": 0.12298387289047241, + "rewards/accuracy_reward/std": 0.32875028252601624, + "rewards/format_reward/mean": 0.435546875, + "rewards/format_reward/std": 0.49631330370903015, + "rewards/tag_count_reward/mean": 0.8291015625, + "rewards/tag_count_reward/std": 0.25859543681144714, "step": 2868 }, { @@ -83187,27 +83187,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1829.0, - "completions/mean_length": 773.1796875, - "completions/mean_terminated_length": 715.9428100585938, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1061.5703125, + "completions/mean_terminated_length": 993.6116943359375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, "epoch": 0.9794315951182043, - "grad_norm": 2.0342190265655518, - "kl": 8.0078125, - "learning_rate": 1.0118776501617519e-07, - "loss": 0.5134, - "num_tokens": 1520508999.0, - "reward": 1.85107421875, - "reward_std": 0.5643364191055298, - "rewards/accuracy_reward/mean": 0.0703125, - "rewards/accuracy_reward/std": 0.25592297315597534, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.92529296875, - "rewards/tag_count_reward/std": 0.19574713706970215, + "grad_norm": 2.116703510284424, + "kl": 4.3046875, + "learning_rate": 1.0118866597750134e-07, + "loss": 0.1898, + "num_tokens": 1657888569.0, + "reward": 1.45947265625, + "reward_std": 0.7092756032943726, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.513671875, + "rewards/format_reward/std": 0.5003018379211426, + "rewards/tag_count_reward/mean": 0.85400390625, + "rewards/tag_count_reward/std": 0.24540509283542633, "step": 2869 }, { @@ -83216,27 +83216,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1845.0, - "completions/mean_length": 710.8828125, - "completions/mean_terminated_length": 673.2931518554688, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1057.89453125, + "completions/mean_terminated_length": 980.7705078125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, "epoch": 0.9797729794315951, - "grad_norm": 0.9224192500114441, - "kl": 6.6953125, - "learning_rate": 1.0114915755417014e-07, - "loss": 0.4794, - "num_tokens": 1520960203.0, - "reward": 1.88720703125, - "reward_std": 0.4814002513885498, - "rewards/accuracy_reward/mean": 0.06653226166963577, - "rewards/accuracy_reward/std": 0.2494617998600006, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.17180897295475006, + "grad_norm": 2.7645609378814697, + "kl": 4.390625, + "learning_rate": 1.011500292428775e-07, + "loss": 0.2122, + "num_tokens": 1658517443.0, + "reward": 1.3759765625, + "reward_std": 0.6637743711471558, + "rewards/accuracy_reward/mean": 0.08266129344701767, + "rewards/accuracy_reward/std": 0.2756475806236267, + "rewards/format_reward/mean": 0.44921875, + "rewards/format_reward/std": 0.497901052236557, + "rewards/tag_count_reward/mean": 0.8466796875, + "rewards/tag_count_reward/std": 0.2466581165790558, "step": 2870 }, { @@ -83245,27 +83245,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.087890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1862.0, - "completions/mean_length": 773.857421875, - "completions/mean_terminated_length": 730.0989990234375, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1116.96875, + "completions/mean_terminated_length": 1027.2547607421875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.980114363744986, - "grad_norm": 3.156883478164673, - "kl": 7.5, - "learning_rate": 1.0111118715383995e-07, - "loss": 0.4313, - "num_tokens": 1521435682.0, - "reward": 1.82568359375, - "reward_std": 0.5020943284034729, - "rewards/accuracy_reward/mean": 0.05078125, - "rewards/accuracy_reward/std": 0.21976542472839355, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.92724609375, - "rewards/tag_count_reward/std": 0.19772392511367798, + "grad_norm": 2.9820592403411865, + "kl": 5.0546875, + "learning_rate": 1.0111203005213692e-07, + "loss": 0.2683, + "num_tokens": 1659168595.0, + "reward": 1.3203125, + "reward_std": 0.6986395120620728, + "rewards/accuracy_reward/mean": 0.044921875, + "rewards/accuracy_reward/std": 0.20733514428138733, + "rewards/format_reward/mean": 0.44140625, + "rewards/format_reward/std": 0.4970405399799347, + "rewards/tag_count_reward/mean": 0.833984375, + "rewards/tag_count_reward/std": 0.26455172896385193, "step": 2871 }, { @@ -83274,27 +83274,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.009765625, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 695.564453125, - "completions/mean_terminated_length": 682.226806640625, - "completions/min_length": 87.0, - "completions/min_terminated_length": 87.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1054.677734375, + "completions/mean_terminated_length": 1005.8257446289062, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, "epoch": 0.9804557480583768, - "grad_norm": 1.7067991495132446, - "kl": 5.765625, - "learning_rate": 1.0107385386907679e-07, - "loss": 0.3465, - "num_tokens": 1521868387.0, - "reward": 1.9130859375, - "reward_std": 0.4521946907043457, - "rewards/accuracy_reward/mean": 0.080078125, - "rewards/accuracy_reward/std": 0.271679550409317, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.9462890625, - "rewards/tag_count_reward/std": 0.16565679013729095, + "grad_norm": 1.3698443174362183, + "kl": 4.43359375, + "learning_rate": 1.0107466845925335e-07, + "loss": 0.1976, + "num_tokens": 1659785166.0, + "reward": 1.41943359375, + "reward_std": 0.7365034222602844, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.501953125, + "rewards/format_reward/std": 0.5004851818084717, + "rewards/tag_count_reward/mean": 0.84326171875, + "rewards/tag_count_reward/std": 0.24920323491096497, "step": 2872 }, { @@ -83303,27 +83303,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 695.802734375, - "completions/mean_terminated_length": 668.8665771484375, - "completions/min_length": 99.0, - "completions/min_terminated_length": 99.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1023.806640625, + "completions/mean_terminated_length": 966.7897338867188, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, "epoch": 0.9807971323717675, - "grad_norm": 1.465183138847351, - "kl": 7.10546875, - "learning_rate": 1.0103715775286826e-07, - "loss": 0.4461, - "num_tokens": 1522299438.0, - "reward": 1.90576171875, - "reward_std": 0.500202476978302, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.94287109375, - "rewards/tag_count_reward/std": 0.1663554310798645, + "grad_norm": 2.4540865421295166, + "kl": 5.06640625, + "learning_rate": 1.0103794451729503e-07, + "loss": 0.2646, + "num_tokens": 1660384155.0, + "reward": 1.38525390625, + "reward_std": 0.6727242469787598, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.458984375, + "rewards/format_reward/std": 0.49880221486091614, + "rewards/tag_count_reward/mean": 0.85595703125, + "rewards/tag_count_reward/std": 0.24104034900665283, "step": 2873 }, { @@ -83332,27 +83332,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0546875, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1973.0, - "completions/mean_length": 757.095703125, - "completions/mean_terminated_length": 682.415283203125, - "completions/min_length": 64.0, - "completions/min_terminated_length": 64.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1044.15234375, + "completions/mean_terminated_length": 972.7489013671875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.9811385166851583, - "grad_norm": 1.1074867248535156, - "kl": 8.34375, - "learning_rate": 1.010010988572979e-07, - "loss": 0.5809, - "num_tokens": 1522761615.0, - "reward": 1.89892578125, - "reward_std": 0.5496411323547363, - "rewards/accuracy_reward/mean": 0.123046875, - "rewards/accuracy_reward/std": 0.32881227135658264, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.92236328125, - "rewards/tag_count_reward/std": 0.20260746777057648, + "grad_norm": 3.191431999206543, + "kl": 5.3125, + "learning_rate": 1.0100185827842445e-07, + "loss": 0.2621, + "num_tokens": 1660993305.0, + "reward": 1.3095703125, + "reward_std": 0.70173180103302, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.412109375, + "rewards/format_reward/std": 0.49269601702690125, + "rewards/tag_count_reward/mean": 0.8134765625, + "rewards/tag_count_reward/std": 0.2624102234840393, "step": 2874 }, { @@ -83361,27 +83361,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.091796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 783.8515625, - "completions/mean_terminated_length": 761.2325439453125, - "completions/min_length": 125.0, - "completions/min_terminated_length": 125.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1146.390625, + "completions/mean_terminated_length": 1055.26025390625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, "epoch": 0.9814799009985491, - "grad_norm": 1.0652945041656494, - "kl": 6.4921875, - "learning_rate": 1.0096567723354473e-07, - "loss": 0.4239, - "num_tokens": 1523248979.0, - "reward": 1.9462890625, - "reward_std": 0.5402163863182068, - "rewards/accuracy_reward/mean": 0.14717741310596466, - "rewards/accuracy_reward/std": 0.3546403646469116, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.1779806911945343, + "grad_norm": 1.661904215812683, + "kl": 4.5078125, + "learning_rate": 1.009664097938983e-07, + "loss": 0.2103, + "num_tokens": 1661666289.0, + "reward": 1.42138671875, + "reward_std": 0.7657043933868408, + "rewards/accuracy_reward/mean": 0.12298387289047241, + "rewards/accuracy_reward/std": 0.32875028252601624, + "rewards/format_reward/mean": 0.46875, + "rewards/format_reward/std": 0.4995105266571045, + "rewards/tag_count_reward/mean": 0.83349609375, + "rewards/tag_count_reward/std": 0.259809672832489, "step": 2875 }, { @@ -83390,27 +83390,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1934.0, - "completions/mean_length": 786.330078125, - "completions/mean_terminated_length": 743.0000610351562, - "completions/min_length": 104.0, - "completions/min_terminated_length": 104.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1152.697265625, + "completions/mean_terminated_length": 1074.76220703125, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, "epoch": 0.9818212853119399, - "grad_norm": 1.4906625747680664, - "kl": 6.2265625, - "learning_rate": 1.0093089293188319e-07, - "loss": 0.3868, - "num_tokens": 1523725580.0, - "reward": 1.86181640625, - "reward_std": 0.5163675546646118, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.18606694042682648, + "grad_norm": 1.9984285831451416, + "kl": 5.171875, + "learning_rate": 1.0093159911406735e-07, + "loss": 0.2399, + "num_tokens": 1662330470.0, + "reward": 1.35888671875, + "reward_std": 0.701007068157196, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.46484375, + "rewards/format_reward/std": 0.49925029277801514, + "rewards/tag_count_reward/mean": 0.83544921875, + "rewards/tag_count_reward/std": 0.24460558593273163, "step": 2876 }, { @@ -83419,27 +83419,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2014.0, - "completions/mean_length": 720.775390625, - "completions/mean_terminated_length": 699.7083740234375, - "completions/min_length": 89.0, - "completions/min_terminated_length": 89.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1085.359375, + "completions/mean_terminated_length": 1023.318115234375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.9821626696253307, - "grad_norm": 2.050997495651245, - "kl": 4.58984375, - "learning_rate": 1.0089674600168329e-07, - "loss": 0.3026, - "num_tokens": 1524170697.0, - "reward": 1.88720703125, - "reward_std": 0.48582902550697327, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.93994140625, - "rewards/tag_count_reward/std": 0.18018634617328644, + "grad_norm": 3.819279432296753, + "kl": 4.95703125, + "learning_rate": 1.0089742628837653e-07, + "loss": 0.2147, + "num_tokens": 1662962254.0, + "reward": 1.388671875, + "reward_std": 0.7131758332252502, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.462890625, + "rewards/format_reward/std": 0.4991086423397064, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.2402387410402298, "step": 2877 }, { @@ -83448,27 +83448,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.013671875, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 719.587890625, - "completions/mean_terminated_length": 701.1742553710938, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1065.890625, + "completions/mean_terminated_length": 1015.474365234375, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, "epoch": 0.9825040539387215, - "grad_norm": 1.5940486192703247, - "kl": 3.71875, - "learning_rate": 1.0086323649141032e-07, - "loss": 0.253, - "num_tokens": 1524615382.0, - "reward": 1.9189453125, - "reward_std": 0.4246072769165039, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.90625, - "rewards/format_reward/std": 0.29176566004753113, - "rewards/tag_count_reward/mean": 0.9580078125, - "rewards/tag_count_reward/std": 0.1448541134595871, + "grad_norm": 2.106335401535034, + "kl": 4.5390625, + "learning_rate": 1.0086389136536468e-07, + "loss": 0.2047, + "num_tokens": 1663584246.0, + "reward": 1.4228515625, + "reward_std": 0.6858218312263489, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.494140625, + "rewards/format_reward/std": 0.5004546642303467, + "rewards/tag_count_reward/mean": 0.8564453125, + "rewards/tag_count_reward/std": 0.24460412561893463, "step": 2878 }, { @@ -83477,27 +83477,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1929.0, - "completions/mean_length": 691.673828125, - "completions/mean_terminated_length": 647.9213256835938, - "completions/min_length": 76.0, - "completions/min_terminated_length": 76.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1058.419921875, + "completions/mean_terminated_length": 1003.3299560546875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.9828454382521123, - "grad_norm": 2.482257127761841, - "kl": 7.109375, - "learning_rate": 1.0083036444862492e-07, - "loss": 0.4532, - "num_tokens": 1525047151.0, - "reward": 1.89208984375, - "reward_std": 0.4856886863708496, - "rewards/accuracy_reward/mean": 0.08984375, - "rewards/accuracy_reward/std": 0.2862374484539032, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.1838454306125641, + "grad_norm": 2.9348325729370117, + "kl": 4.3515625, + "learning_rate": 1.0083099439266465e-07, + "loss": 0.2216, + "num_tokens": 1664203789.0, + "reward": 1.46923828125, + "reward_std": 0.7266461849212646, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.51953125, + "rewards/format_reward/std": 0.5001069903373718, + "rewards/tag_count_reward/mean": 0.85400390625, + "rewards/tag_count_reward/std": 0.2423962652683258, "step": 2879 }, { @@ -83506,27 +83506,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 754.173828125, - "completions/mean_terminated_length": 715.1246948242188, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1095.396484375, + "completions/mean_terminated_length": 1042.364990234375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, "epoch": 0.9831868225655032, - "grad_norm": 1.0496032238006592, - "kl": 5.890625, - "learning_rate": 1.0079812991998291e-07, - "loss": 0.406, - "num_tokens": 1525501608.0, - "reward": 1.8701171875, - "reward_std": 0.49785301089286804, - "rewards/accuracy_reward/mean": 0.06854838877916336, - "rewards/accuracy_reward/std": 0.25293970108032227, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.18859504163265228, + "grad_norm": 1.8103909492492676, + "kl": 4.58203125, + "learning_rate": 1.0079873541700307e-07, + "loss": 0.2265, + "num_tokens": 1664832952.0, + "reward": 1.380859375, + "reward_std": 0.7209969162940979, + "rewards/accuracy_reward/mean": 0.058467742055654526, + "rewards/accuracy_reward/std": 0.23486268520355225, + "rewards/format_reward/mean": 0.490234375, + "rewards/format_reward/std": 0.5003935098648071, + "rewards/tag_count_reward/mean": 0.833984375, + "rewards/tag_count_reward/std": 0.25321289896965027, "step": 2880 }, { @@ -83535,27 +83535,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.021484375, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1853.0, - "completions/mean_length": 732.9921875, - "completions/mean_terminated_length": 704.1197509765625, - "completions/min_length": 99.0, - "completions/min_terminated_length": 99.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1063.86328125, + "completions/mean_terminated_length": 1019.677490234375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.9835282068788939, - "grad_norm": 1.1123335361480713, - "kl": 4.94140625, - "learning_rate": 1.0076653295123537e-07, - "loss": 0.2844, - "num_tokens": 1525959044.0, - "reward": 1.95166015625, - "reward_std": 0.4679659307003021, - "rewards/accuracy_reward/mean": 0.130859375, - "rewards/accuracy_reward/std": 0.33757632970809937, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.94580078125, - "rewards/tag_count_reward/std": 0.16438506543636322, + "grad_norm": 1.8105005025863647, + "kl": 4.3203125, + "learning_rate": 1.0076711448420045e-07, + "loss": 0.1692, + "num_tokens": 1665459794.0, + "reward": 1.46533203125, + "reward_std": 0.6987200379371643, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.505859375, + "rewards/format_reward/std": 0.5004546642303467, + "rewards/tag_count_reward/mean": 0.85400390625, + "rewards/tag_count_reward/std": 0.2434033453464508, "step": 2881 }, { @@ -83564,27 +83564,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.048828125, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1973.0, - "completions/mean_length": 806.01953125, - "completions/mean_terminated_length": 742.2628784179688, - "completions/min_length": 53.0, - "completions/min_terminated_length": 53.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1126.685546875, + "completions/mean_terminated_length": 1057.00634765625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.9838695911922847, - "grad_norm": 1.9855237007141113, - "kl": 7.71875, - "learning_rate": 1.0073557358722834e-07, - "loss": 0.4995, - "num_tokens": 1526445614.0, - "reward": 1.85986328125, - "reward_std": 0.5459388494491577, - "rewards/accuracy_reward/mean": 0.06640625, - "rewards/accuracy_reward/std": 0.2492343932390213, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.92626953125, - "rewards/tag_count_reward/std": 0.20165254175662994, + "grad_norm": 1.7949726581573486, + "kl": 4.37890625, + "learning_rate": 1.0073613163917094e-07, + "loss": 0.2072, + "num_tokens": 1666110545.0, + "reward": 1.43603515625, + "reward_std": 0.710891842842102, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.51171875, + "rewards/format_reward/std": 0.5003514885902405, + "rewards/tag_count_reward/mean": 0.84228515625, + "rewards/tag_count_reward/std": 0.25681331753730774, "step": 2882 }, { @@ -83593,27 +83593,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2037.0, - "completions/mean_length": 750.86328125, - "completions/mean_terminated_length": 700.8721923828125, - "completions/min_length": 4.0, - "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1081.509765625, + "completions/mean_terminated_length": 1012.7635498046875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, "epoch": 0.9842109755056755, - "grad_norm": 1.1459872722625732, - "kl": 5.82421875, - "learning_rate": 1.0070525187190301e-07, - "loss": 0.3703, - "num_tokens": 1526912424.0, - "reward": 1.828125, - "reward_std": 0.5574748516082764, - "rewards/accuracy_reward/mean": 0.060483869165182114, - "rewards/accuracy_reward/std": 0.2386218160390854, - "rewards/format_reward/mean": 0.84375, - "rewards/format_reward/std": 0.36344730854034424, - "rewards/tag_count_reward/mean": 0.92578125, - "rewards/tag_count_reward/std": 0.1840227097272873, + "grad_norm": 2.2907166481018066, + "kl": 4.96875, + "learning_rate": 1.0070578692592245e-07, + "loss": 0.2262, + "num_tokens": 1666746646.0, + "reward": 1.44482421875, + "reward_std": 0.7225590944290161, + "rewards/accuracy_reward/mean": 0.08467742055654526, + "rewards/accuracy_reward/std": 0.278682142496109, + "rewards/format_reward/mean": 0.517578125, + "rewards/format_reward/std": 0.5001795887947083, + "rewards/tag_count_reward/mean": 0.84521484375, + "rewards/tag_count_reward/std": 0.24993453919887543, "step": 2883 }, { @@ -83622,27 +83622,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1881.0, - "completions/mean_length": 776.88671875, - "completions/mean_terminated_length": 733.2323608398438, - "completions/min_length": 99.0, - "completions/min_terminated_length": 99.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1183.615234375, + "completions/mean_terminated_length": 1087.9891357421875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, "epoch": 0.9845523598190663, - "grad_norm": 1.5667763948440552, - "kl": 6.0234375, - "learning_rate": 1.0067556784829557e-07, - "loss": 0.4198, - "num_tokens": 1527396206.0, - "reward": 1.87841796875, - "reward_std": 0.5296695828437805, + "grad_norm": 2.2633111476898193, + "kl": 4.625, + "learning_rate": 1.0067608038755635e-07, + "loss": 0.2051, + "num_tokens": 1667438673.0, + "reward": 1.44091796875, + "reward_std": 0.7243264317512512, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.18643119931221008, + "rewards/format_reward/mean": 0.505859375, + "rewards/format_reward/std": 0.5004546642303467, + "rewards/tag_count_reward/mean": 0.84130859375, + "rewards/tag_count_reward/std": 0.2562098503112793, "step": 2884 }, { @@ -83651,27 +83651,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1994.0, - "completions/mean_length": 764.12890625, - "completions/mean_terminated_length": 714.6490478515625, - "completions/min_length": 92.0, - "completions/min_terminated_length": 92.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1152.09765625, + "completions/mean_terminated_length": 1042.0745849609375, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, "epoch": 0.9848937441324571, - "grad_norm": 1.4942023754119873, - "kl": 7.328125, - "learning_rate": 1.0064652155853695e-07, - "loss": 0.5038, - "num_tokens": 1527870544.0, - "reward": 1.8798828125, - "reward_std": 0.5620890855789185, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.85546875, - "rewards/format_reward/std": 0.35197147727012634, - "rewards/tag_count_reward/mean": 0.9287109375, - "rewards/tag_count_reward/std": 0.1929427534341812, + "grad_norm": 2.941580295562744, + "kl": 5.625, + "learning_rate": 1.0064701206626763e-07, + "loss": 0.2918, + "num_tokens": 1668111651.0, + "reward": 1.40673828125, + "reward_std": 0.7130698561668396, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.48828125, + "rewards/format_reward/std": 0.5003514885902405, + "rewards/tag_count_reward/mean": 0.81884765625, + "rewards/tag_count_reward/std": 0.26081544160842896, "step": 2885 }, { @@ -83680,27 +83680,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 756.03125, - "completions/mean_terminated_length": 703.5121459960938, - "completions/min_length": 95.0, - "completions/min_terminated_length": 95.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1122.66796875, + "completions/mean_terminated_length": 1058.9185791015625, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, "epoch": 0.9852351284458479, - "grad_norm": 1.0311920642852783, - "kl": 6.7734375, - "learning_rate": 1.0061811304385314e-07, - "loss": 0.4419, - "num_tokens": 1528338144.0, - "reward": 1.83447265625, - "reward_std": 0.494672030210495, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.93017578125, - "rewards/tag_count_reward/std": 0.1820801943540573, + "grad_norm": 3.773594856262207, + "kl": 4.875, + "learning_rate": 1.0061858200334487e-07, + "loss": 0.2015, + "num_tokens": 1668766969.0, + "reward": 1.4404296875, + "reward_std": 0.7245013117790222, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.5234375, + "rewards/format_reward/std": 0.49993884563446045, + "rewards/tag_count_reward/mean": 0.8505859375, + "rewards/tag_count_reward/std": 0.2505173683166504, "step": 2886 }, { @@ -83709,27 +83709,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1803.0, - "completions/mean_length": 758.474609375, - "completions/mean_terminated_length": 706.0548706054688, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1112.455078125, + "completions/mean_terminated_length": 1041.6995849609375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, "epoch": 0.9855765127592387, - "grad_norm": 1.2463688850402832, - "kl": 6.3359375, - "learning_rate": 1.0059034234456476e-07, - "loss": 0.3977, - "num_tokens": 1528801571.0, - "reward": 1.857421875, - "reward_std": 0.5183749198913574, - "rewards/accuracy_reward/mean": 0.0546875, - "rewards/accuracy_reward/std": 0.2275916188955307, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.93359375, - "rewards/tag_count_reward/std": 0.19655638933181763, + "grad_norm": 1.5365434885025024, + "kl": 5.15625, + "learning_rate": 1.0059079023916987e-07, + "loss": 0.241, + "num_tokens": 1669411634.0, + "reward": 1.447265625, + "reward_std": 0.7264338731765747, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.4982847273349762, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.24330566823482513, "step": 2887 }, { @@ -83738,27 +83738,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1979.0, - "completions/mean_length": 697.787109375, - "completions/mean_terminated_length": 665.3820190429688, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1021.29296875, + "completions/mean_terminated_length": 964.1361083984375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, "epoch": 0.9859178970726296, - "grad_norm": 1.701975703239441, - "kl": 5.19140625, - "learning_rate": 1.0056320950008728e-07, - "loss": 0.3726, - "num_tokens": 1529236070.0, - "reward": 1.99853515625, - "reward_std": 0.4950244128704071, - "rewards/accuracy_reward/mean": 0.146484375, - "rewards/accuracy_reward/std": 0.35393697023391724, - "rewards/format_reward/mean": 0.90234375, - "rewards/format_reward/std": 0.29713961482048035, - "rewards/tag_count_reward/mean": 0.94970703125, - "rewards/tag_count_reward/std": 0.16339389979839325, + "grad_norm": 3.0063259601593018, + "kl": 4.765625, + "learning_rate": 1.0056363681321794e-07, + "loss": 0.2098, + "num_tokens": 1670011768.0, + "reward": 1.4716796875, + "reward_std": 0.7069277763366699, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.494140625, + "rewards/format_reward/std": 0.5004546642303467, + "rewards/tag_count_reward/mean": 0.8662109375, + "rewards/tag_count_reward/std": 0.22539570927619934, "step": 2888 }, { @@ -83767,27 +83767,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1956.0, - "completions/mean_length": 703.5390625, - "completions/mean_terminated_length": 679.4830932617188, - "completions/min_length": 105.0, - "completions/min_terminated_length": 105.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1055.076171875, + "completions/mean_terminated_length": 988.8812866210938, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.9862592813860203, - "grad_norm": 1.230083703994751, - "kl": 5.7109375, - "learning_rate": 1.0053671454893084e-07, - "loss": 0.3703, - "num_tokens": 1529678954.0, - "reward": 1.86181640625, - "reward_std": 0.4793586730957031, - "rewards/accuracy_reward/mean": 0.044921875, - "rewards/accuracy_reward/std": 0.20733514428138733, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.94189453125, - "rewards/tag_count_reward/std": 0.17533229291439056, + "grad_norm": 2.745061159133911, + "kl": 5.0703125, + "learning_rate": 1.0053712176405765e-07, + "loss": 0.2317, + "num_tokens": 1670634639.0, + "reward": 1.36376953125, + "reward_std": 0.707394003868103, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.466796875, + "rewards/format_reward/std": 0.4993842542171478, + "rewards/tag_count_reward/mean": 0.82666015625, + "rewards/tag_count_reward/std": 0.25433072447776794, "step": 2889 }, { @@ -83796,27 +83796,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.076171875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 769.111328125, - "completions/mean_terminated_length": 730.5130615234375, - "completions/min_length": 83.0, - "completions/min_terminated_length": 83.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1114.408203125, + "completions/mean_terminated_length": 1037.4312744140625, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, "epoch": 0.9866006656994111, - "grad_norm": 1.0111032724380493, - "kl": 5.9921875, - "learning_rate": 1.0051085752870009e-07, - "loss": 0.3956, - "num_tokens": 1530148531.0, - "reward": 1.89794921875, - "reward_std": 0.4716569185256958, - "rewards/accuracy_reward/mean": 0.07421875, - "rewards/accuracy_reward/std": 0.2623828947544098, - "rewards/format_reward/mean": 0.880859375, - "rewards/format_reward/std": 0.32427072525024414, - "rewards/tag_count_reward/mean": 0.94287109375, - "rewards/tag_count_reward/std": 0.1749558448791504, + "grad_norm": 1.5224727392196655, + "kl": 4.16015625, + "learning_rate": 1.0051124512935078e-07, + "loss": 0.1663, + "num_tokens": 1671281008.0, + "reward": 1.3974609375, + "reward_std": 0.7200717926025391, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.474609375, + "rewards/format_reward/std": 0.4998432695865631, + "rewards/tag_count_reward/mean": 0.8408203125, + "rewards/tag_count_reward/std": 0.24690592288970947, "step": 2890 }, { @@ -83825,27 +83825,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1983.0, - "completions/mean_length": 753.181640625, - "completions/mean_terminated_length": 719.4489135742188, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1123.869140625, + "completions/mean_terminated_length": 1051.8841552734375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.9869420500128019, - "grad_norm": 1.263071894645691, - "kl": 6.453125, - "learning_rate": 1.0048563847609443e-07, - "loss": 0.4032, - "num_tokens": 1530612112.0, - "reward": 1.93896484375, - "reward_std": 0.5125135183334351, + "grad_norm": 1.4512288570404053, + "kl": 4.5078125, + "learning_rate": 1.0048600694585238e-07, + "loss": 0.2019, + "num_tokens": 1671934381.0, + "reward": 1.462890625, + "reward_std": 0.7399915456771851, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.93896484375, - "rewards/tag_count_reward/std": 0.1784919947385788, + "rewards/format_reward/mean": 0.51171875, + "rewards/format_reward/std": 0.5003514885902405, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.25592297315597534, "step": 2891 }, { @@ -83854,27 +83854,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 825.1875, - "completions/mean_terminated_length": 778.0608520507812, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1145.53515625, + "completions/mean_terminated_length": 1087.3721923828125, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, "epoch": 0.9872834343261927, - "grad_norm": 1.4755581617355347, - "kl": 6.1953125, - "learning_rate": 1.0046105742690761e-07, - "loss": 0.3688, - "num_tokens": 1531113040.0, - "reward": 1.87890625, - "reward_std": 0.4649357497692108, - "rewards/accuracy_reward/mean": 0.058467742055654526, - "rewards/accuracy_reward/std": 0.23486268520355225, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.17900052666664124, + "grad_norm": 4.025757789611816, + "kl": 4.54296875, + "learning_rate": 1.0046140724941062e-07, + "loss": 0.1532, + "num_tokens": 1672599327.0, + "reward": 1.36376953125, + "reward_std": 0.6686165928840637, + "rewards/accuracy_reward/mean": 0.04032257944345474, + "rewards/accuracy_reward/std": 0.19691328704357147, + "rewards/format_reward/mean": 0.482421875, + "rewards/format_reward/std": 0.5001795887947083, + "rewards/tag_count_reward/mean": 0.84228515625, + "rewards/tag_count_reward/std": 0.24461729824543, "step": 2892 }, { @@ -83883,27 +83883,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1985.0, - "completions/mean_length": 762.21484375, - "completions/mean_terminated_length": 712.6612548828125, - "completions/min_length": 86.0, - "completions/min_terminated_length": 86.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1149.36328125, + "completions/mean_terminated_length": 1091.447021484375, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, "epoch": 0.9876248186395835, - "grad_norm": 1.8035228252410889, - "kl": 8.5078125, - "learning_rate": 1.0043711441602797e-07, - "loss": 0.5307, - "num_tokens": 1531586462.0, - "reward": 1.85302734375, - "reward_std": 0.5432873964309692, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.861328125, - "rewards/format_reward/std": 0.34594178199768066, - "rewards/tag_count_reward/mean": 0.93310546875, - "rewards/tag_count_reward/std": 0.1864875704050064, + "grad_norm": 1.9770722389221191, + "kl": 4.10546875, + "learning_rate": 1.0043744607496673e-07, + "loss": 0.1691, + "num_tokens": 1673270969.0, + "reward": 1.47509765625, + "reward_std": 0.7299438714981079, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.529296875, + "rewards/format_reward/std": 0.49962911009788513, + "rewards/tag_count_reward/mean": 0.85205078125, + "rewards/tag_count_reward/std": 0.23814493417739868, "step": 2893 }, { @@ -83912,27 +83912,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0234375, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 752.18359375, - "completions/mean_terminated_length": 721.0840454101562, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1063.9140625, + "completions/mean_terminated_length": 1011.2674560546875, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, "epoch": 0.9879662029529743, - "grad_norm": 1.287606954574585, - "kl": 6.2734375, - "learning_rate": 1.0041380947743828e-07, - "loss": 0.3622, - "num_tokens": 1532047356.0, - "reward": 1.9140625, - "reward_std": 0.5265927314758301, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.87109375, - "rewards/format_reward/std": 0.33542385697364807, - "rewards/tag_count_reward/mean": 0.939453125, - "rewards/tag_count_reward/std": 0.17900052666664124, + "grad_norm": 1.9249495267868042, + "kl": 4.734375, + "learning_rate": 1.0041412345655508e-07, + "loss": 0.208, + "num_tokens": 1673891469.0, + "reward": 1.45263671875, + "reward_std": 0.7657089233398438, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.50390625, + "rewards/format_reward/std": 0.5004737377166748, + "rewards/tag_count_reward/mean": 0.83349609375, + "rewards/tag_count_reward/std": 0.25409770011901855, "step": 2894 }, { @@ -83941,27 +83941,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.048828125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1957.0, - "completions/mean_length": 745.78515625, - "completions/mean_terminated_length": 703.7781982421875, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1075.15625, + "completions/mean_terminated_length": 1025.2156982421875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, "epoch": 0.9883075872663651, - "grad_norm": 1.184757947921753, - "kl": 6.5625, - "learning_rate": 1.0039114264421555e-07, - "loss": 0.4269, - "num_tokens": 1532508270.0, - "reward": 1.94384765625, - "reward_std": 0.5128960609436035, + "grad_norm": 2.2972631454467773, + "kl": 4.29296875, + "learning_rate": 1.0039143942730297e-07, + "loss": 0.1586, + "num_tokens": 1674521021.0, + "reward": 1.42138671875, + "reward_std": 0.7152012586593628, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.94384765625, - "rewards/tag_count_reward/std": 0.16814936697483063, + "rewards/format_reward/mean": 0.4609375, + "rewards/format_reward/std": 0.4989593029022217, + "rewards/tag_count_reward/mean": 0.84521484375, + "rewards/tag_count_reward/std": 0.24499201774597168, "step": 2895 }, { @@ -83972,25 +83972,25 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2029.0, - "completions/mean_length": 739.978515625, - "completions/mean_terminated_length": 667.1608276367188, - "completions/min_length": 68.0, - "completions/min_terminated_length": 68.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1000.658203125, + "completions/mean_terminated_length": 942.3526000976562, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, "epoch": 0.988648971579756, - "grad_norm": 3.383671522140503, - "kl": 11.640625, - "learning_rate": 1.0036911394853133e-07, - "loss": 0.7671, - "num_tokens": 1532957027.0, - "reward": 1.84912109375, - "reward_std": 0.5863605737686157, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.84765625, - "rewards/format_reward/std": 0.35970520973205566, - "rewards/tag_count_reward/mean": 0.91943359375, - "rewards/tag_count_reward/std": 0.20978572964668274, + "grad_norm": 3.8505115509033203, + "kl": 5.3515625, + "learning_rate": 1.0036939401943061e-07, + "loss": 0.2489, + "num_tokens": 1675103246.0, + "reward": 1.388671875, + "reward_std": 0.6745654344558716, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.466796875, + "rewards/format_reward/std": 0.4993842542171478, + "rewards/tag_count_reward/mean": 0.849609375, + "rewards/tag_count_reward/std": 0.24598506093025208, "step": 2896 }, { @@ -83999,27 +83999,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1938.0, - "completions/mean_length": 730.890625, - "completions/mean_terminated_length": 707.3240356445312, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 1066.171875, + "completions/mean_terminated_length": 1019.9918212890625, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, "epoch": 0.9889903558931467, - "grad_norm": 1.3612533807754517, - "kl": 5.4765625, - "learning_rate": 1.0034772342165116e-07, - "loss": 0.3259, - "num_tokens": 1533404635.0, - "reward": 1.93408203125, - "reward_std": 0.4663509726524353, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.94775390625, - "rewards/tag_count_reward/std": 0.16427458822727203, + "grad_norm": 2.734149217605591, + "kl": 4.4296875, + "learning_rate": 1.0034798726425117e-07, + "loss": 0.2073, + "num_tokens": 1675722518.0, + "reward": 1.46044921875, + "reward_std": 0.7051781415939331, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.498046875, + "rewards/format_reward/std": 0.5004851818084717, + "rewards/tag_count_reward/mean": 0.86865234375, + "rewards/tag_count_reward/std": 0.23294830322265625, "step": 2897 }, { @@ -84028,27 +84028,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 807.328125, - "completions/mean_terminated_length": 748.973388671875, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1173.54296875, + "completions/mean_terminated_length": 1087.22314453125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.9893317402065375, - "grad_norm": 1.000133752822876, - "kl": 7.7578125, - "learning_rate": 1.003269710939351e-07, - "loss": 0.5293, - "num_tokens": 1533906179.0, - "reward": 1.82861328125, - "reward_std": 0.46436357498168945, - "rewards/accuracy_reward/mean": 0.015625, - "rewards/accuracy_reward/std": 0.12414088100194931, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.18751464784145355, + "grad_norm": 2.5757877826690674, + "kl": 3.921875, + "learning_rate": 1.0032721919217075e-07, + "loss": 0.154, + "num_tokens": 1676411564.0, + "reward": 1.38427734375, + "reward_std": 0.662421464920044, + "rewards/accuracy_reward/mean": 0.025390625, + "rewards/accuracy_reward/std": 0.15746226906776428, + "rewards/format_reward/mean": 0.50390625, + "rewards/format_reward/std": 0.5004737377166748, + "rewards/tag_count_reward/mean": 0.85498046875, + "rewards/tag_count_reward/std": 0.2464812994003296, "step": 2898 }, { @@ -84057,27 +84057,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03125, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1897.0, - "completions/mean_length": 796.3671875, - "completions/mean_terminated_length": 755.991943359375, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1136.310546875, + "completions/mean_terminated_length": 1069.4150390625, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, "epoch": 0.9896731245199283, - "grad_norm": 1.1438095569610596, - "kl": 7.9609375, - "learning_rate": 1.0030685699483725e-07, - "loss": 0.5112, - "num_tokens": 1534384527.0, - "reward": 1.826171875, - "reward_std": 0.5321215391159058, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.849609375, - "rewards/format_reward/std": 0.35780346393585205, - "rewards/tag_count_reward/mean": 0.927734375, - "rewards/tag_count_reward/std": 0.1880800724029541, + "grad_norm": 3.2525949478149414, + "kl": 4.234375, + "learning_rate": 1.0030708983268821e-07, + "loss": 0.2026, + "num_tokens": 1677063963.0, + "reward": 1.4033203125, + "reward_std": 0.7334499955177307, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.5078125, + "rewards/format_reward/std": 0.5004279017448425, + "rewards/tag_count_reward/mean": 0.8193359375, + "rewards/tag_count_reward/std": 0.26325365900993347, "step": 2899 }, { @@ -84086,27 +84086,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.095703125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 812.654296875, - "completions/mean_terminated_length": 757.1897583007812, - "completions/min_length": 124.0, - "completions/min_terminated_length": 124.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1192.21875, + "completions/mean_terminated_length": 1101.6500244140625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, "epoch": 0.9900145088333191, - "grad_norm": 1.2878631353378296, - "kl": 7.3203125, - "learning_rate": 1.0028738115290592e-07, - "loss": 0.4888, - "num_tokens": 1534874670.0, - "reward": 1.83740234375, - "reward_std": 0.5386607050895691, - "rewards/accuracy_reward/mean": 0.048828125, - "rewards/accuracy_reward/std": 0.2157193273305893, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.20774433016777039, + "grad_norm": 5.6985249519348145, + "kl": 4.06640625, + "learning_rate": 1.0028759921439508e-07, + "loss": 0.2313, + "num_tokens": 1677748443.0, + "reward": 1.43896484375, + "reward_std": 0.743380606174469, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.544921875, + "rewards/format_reward/std": 0.4984649419784546, + "rewards/tag_count_reward/mean": 0.84326171875, + "rewards/tag_count_reward/std": 0.2516452670097351, "step": 2900 }, { @@ -84115,27 +84115,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 842.08984375, - "completions/mean_terminated_length": 800.6748046875, - "completions/min_length": 73.0, - "completions/min_terminated_length": 73.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1215.958984375, + "completions/mean_terminated_length": 1143.5308837890625, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, "epoch": 0.9903558931467099, - "grad_norm": 1.9414771795272827, - "kl": 5.9765625, - "learning_rate": 1.0026854359578345e-07, - "loss": 0.4032, - "num_tokens": 1535386572.0, - "reward": 1.8466796875, - "reward_std": 0.47419971227645874, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.876953125, - "rewards/format_reward/std": 0.32881227135658264, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.19250644743442535, + "grad_norm": 2.1337475776672363, + "kl": 4.109375, + "learning_rate": 1.0026874736497576e-07, + "loss": 0.1691, + "num_tokens": 1678451766.0, + "reward": 1.37109375, + "reward_std": 0.6828578114509583, + "rewards/accuracy_reward/mean": 0.044921875, + "rewards/accuracy_reward/std": 0.20733514428138733, + "rewards/format_reward/mean": 0.4921875, + "rewards/format_reward/std": 0.5004279017448425, + "rewards/tag_count_reward/mean": 0.833984375, + "rewards/tag_count_reward/std": 0.24685366451740265, "step": 2901 }, { @@ -84144,27 +84144,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.041015625, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 779.513671875, - "completions/mean_terminated_length": 725.2607421875, - "completions/min_length": 123.0, - "completions/min_terminated_length": 123.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1128.314453125, + "completions/mean_terminated_length": 1077.115478515625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, "epoch": 0.9906972774601007, - "grad_norm": 1.2120583057403564, - "kl": 7.15625, - "learning_rate": 1.0025034435020633e-07, - "loss": 0.4822, - "num_tokens": 1535863731.0, - "reward": 1.83740234375, - "reward_std": 0.5296105742454529, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.853515625, - "rewards/format_reward/std": 0.35393697023391724, - "rewards/tag_count_reward/mean": 0.92333984375, - "rewards/tag_count_reward/std": 0.1999441683292389, + "grad_norm": 3.13985538482666, + "kl": 4.28515625, + "learning_rate": 1.0025053431120732e-07, + "loss": 0.1655, + "num_tokens": 1679107511.0, + "reward": 1.43994140625, + "reward_std": 0.6925660371780396, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2746807038784027, + "rewards/format_reward/mean": 0.50390625, + "rewards/format_reward/std": 0.5004737377166748, + "rewards/tag_count_reward/mean": 0.85400390625, + "rewards/tag_count_reward/std": 0.24390532076358795, "step": 2902 }, { @@ -84173,27 +84173,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.04296875, + "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 838.51953125, - "completions/mean_terminated_length": 784.21630859375, - "completions/min_length": 45.0, - "completions/min_terminated_length": 45.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1146.12890625, + "completions/mean_terminated_length": 1075.8778076171875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, "epoch": 0.9910386617734915, - "grad_norm": 1.3991584777832031, - "kl": 7.125, - "learning_rate": 1.0023278344200509e-07, - "loss": 0.4354, - "num_tokens": 1536377341.0, - "reward": 1.7880859375, - "reward_std": 0.5258431434631348, - "rewards/accuracy_reward/mean": 0.04296875, - "rewards/accuracy_reward/std": 0.2029850035905838, - "rewards/format_reward/mean": 0.8359375, - "rewards/format_reward/std": 0.37069445848464966, - "rewards/tag_count_reward/mean": 0.9091796875, - "rewards/tag_count_reward/std": 0.2245972901582718, + "grad_norm": 3.5519700050354004, + "kl": 3.953125, + "learning_rate": 1.0023296007895951e-07, + "loss": 0.168, + "num_tokens": 1679778617.0, + "reward": 1.41552734375, + "reward_std": 0.7182472348213196, + "rewards/accuracy_reward/mean": 0.052734375, + "rewards/accuracy_reward/std": 0.22372129559516907, + "rewards/format_reward/mean": 0.51171875, + "rewards/format_reward/std": 0.5003514885902405, + "rewards/tag_count_reward/mean": 0.85107421875, + "rewards/tag_count_reward/std": 0.24363486468791962, "step": 2903 }, { @@ -84202,27 +84202,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1929.0, - "completions/mean_length": 720.208984375, - "completions/mean_terminated_length": 696.4512329101562, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1054.701171875, + "completions/mean_terminated_length": 986.2693481445312, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, "epoch": 0.9913800460868823, - "grad_norm": 2.394346237182617, - "kl": 5.46484375, - "learning_rate": 1.0021586089610422e-07, - "loss": 0.3727, - "num_tokens": 1536825368.0, - "reward": 1.91650390625, - "reward_std": 0.47387444972991943, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.94775390625, - "rewards/tag_count_reward/std": 0.17084409296512604, + "grad_norm": 2.2591214179992676, + "kl": 4.38671875, + "learning_rate": 1.0021602469319456e-07, + "loss": 0.2079, + "num_tokens": 1680397904.0, + "reward": 1.474609375, + "reward_std": 0.6892572641372681, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.544921875, + "rewards/format_reward/std": 0.4984649419784546, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.24179288744926453, "step": 2904 }, { @@ -84231,27 +84231,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.017578125, + "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1990.0, - "completions/mean_length": 725.705078125, - "completions/mean_terminated_length": 702.0457153320312, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1137.544921875, + "completions/mean_terminated_length": 1058.2908935546875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.9917214304002732, - "grad_norm": 2.2166171073913574, - "kl": 4.75, - "learning_rate": 1.0019957673652214e-07, - "loss": 0.3301, - "num_tokens": 1537265905.0, - "reward": 1.876953125, - "reward_std": 0.44129854440689087, - "rewards/accuracy_reward/mean": 0.02734375, - "rewards/accuracy_reward/std": 0.16324250400066376, - "rewards/format_reward/mean": 0.90234375, - "rewards/format_reward/std": 0.29713961482048035, - "rewards/tag_count_reward/mean": 0.947265625, - "rewards/tag_count_reward/std": 0.17318345606327057, + "grad_norm": 2.848632335662842, + "kl": 4.9609375, + "learning_rate": 1.0019972817796753e-07, + "loss": 0.2393, + "num_tokens": 1681049303.0, + "reward": 1.39794921875, + "reward_std": 0.707427978515625, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.509765625, + "rewards/format_reward/std": 0.5003935098648071, + "rewards/tag_count_reward/mean": 0.84130859375, + "rewards/tag_count_reward/std": 0.2562098503112793, "step": 2905 }, { @@ -84260,27 +84260,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 736.302734375, - "completions/mean_terminated_length": 699.4276733398438, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1041.3125, + "completions/mean_terminated_length": 996.1142578125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, "epoch": 0.9920628147136639, - "grad_norm": 2.6191136837005615, - "kl": 5.703125, - "learning_rate": 1.0018393098637125e-07, - "loss": 0.3778, - "num_tokens": 1537722604.0, - "reward": 1.92822265625, - "reward_std": 0.4573560655117035, - "rewards/accuracy_reward/mean": 0.076171875, - "rewards/accuracy_reward/std": 0.26553234457969666, - "rewards/format_reward/mean": 0.904296875, - "rewards/format_reward/std": 0.2944713830947876, - "rewards/tag_count_reward/mean": 0.94775390625, - "rewards/tag_count_reward/std": 0.1729784905910492, + "grad_norm": 1.3245724439620972, + "kl": 4.35546875, + "learning_rate": 1.001840705564258e-07, + "loss": 0.172, + "num_tokens": 1681662167.0, + "reward": 1.52001953125, + "reward_std": 0.7332930564880371, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.552734375, + "rewards/format_reward/std": 0.4976975917816162, + "rewards/tag_count_reward/mean": 0.85009765625, + "rewards/tag_count_reward/std": 0.25192990899086, "step": 2906 }, { @@ -84289,27 +84289,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.015625, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1907.0, - "completions/mean_length": 709.029296875, - "completions/mean_terminated_length": 687.7758178710938, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1026.3125, + "completions/mean_terminated_length": 958.2000732421875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, "epoch": 0.9924041990270547, - "grad_norm": 1.049399495124817, - "kl": 4.73046875, - "learning_rate": 1.0016892366785786e-07, - "loss": 0.3121, - "num_tokens": 1538163867.0, - "reward": 1.998046875, - "reward_std": 0.4111006259918213, + "grad_norm": 5.573750019073486, + "kl": 6.0625, + "learning_rate": 1.0016905185080935e-07, + "loss": 0.2786, + "num_tokens": 1682265879.0, + "reward": 1.46044921875, + "reward_std": 0.7700638771057129, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, - "rewards/format_reward/mean": 0.919921875, - "rewards/format_reward/std": 0.271679550409317, - "rewards/tag_count_reward/mean": 0.958984375, - "rewards/tag_count_reward/std": 0.14680999517440796, + "rewards/format_reward/mean": 0.505859375, + "rewards/format_reward/std": 0.5004546642303467, + "rewards/tag_count_reward/mean": 0.83544921875, + "rewards/tag_count_reward/std": 0.26011475920677185, "step": 2907 }, { @@ -84318,27 +84318,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 790.71484375, - "completions/mean_terminated_length": 752.7685546875, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1132.71484375, + "completions/mean_terminated_length": 1046.6624755859375, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, "epoch": 0.9927455833404455, - "grad_norm": 1.9230318069458008, - "kl": 5.796875, - "learning_rate": 1.0015455480228208e-07, - "loss": 0.3848, - "num_tokens": 1538642873.0, - "reward": 1.9091796875, - "reward_std": 0.4895654320716858, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.890625, - "rewards/format_reward/std": 0.31241437792778015, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.18930304050445557, + "grad_norm": 2.484156370162964, + "kl": 5.0234375, + "learning_rate": 1.0015467208245076e-07, + "loss": 0.2461, + "num_tokens": 1682919989.0, + "reward": 1.4169921875, + "reward_std": 0.7475748062133789, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.26553234457969666, + "rewards/format_reward/mean": 0.51171875, + "rewards/format_reward/std": 0.5003514885902405, + "rewards/tag_count_reward/mean": 0.8291015625, + "rewards/tag_count_reward/std": 0.2523718476295471, "step": 2908 }, { @@ -84347,27 +84347,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1964.0, - "completions/mean_length": 747.640625, - "completions/mean_terminated_length": 721.737060546875, - "completions/min_length": 63.0, - "completions/min_terminated_length": 63.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1080.0625, + "completions/mean_terminated_length": 1032.458984375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.9930869676538363, - "grad_norm": 2.3826773166656494, - "kl": 5.78125, - "learning_rate": 1.0014082441003791e-07, - "loss": 0.3571, - "num_tokens": 1539106177.0, - "reward": 1.92529296875, - "reward_std": 0.4951738715171814, - "rewards/accuracy_reward/mean": 0.099609375, - "rewards/accuracy_reward/std": 0.29977133870124817, - "rewards/format_reward/mean": 0.8828125, - "rewards/format_reward/std": 0.32195815443992615, - "rewards/tag_count_reward/mean": 0.94287109375, - "rewards/tag_count_reward/std": 0.17213678359985352, + "grad_norm": 6.493854999542236, + "kl": 5.6953125, + "learning_rate": 1.0014093127177493e-07, + "loss": 0.2422, + "num_tokens": 1683553493.0, + "reward": 1.39404296875, + "reward_std": 0.7597787380218506, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.486328125, + "rewards/format_reward/std": 0.5003018379211426, + "rewards/tag_count_reward/mean": 0.81787109375, + "rewards/tag_count_reward/std": 0.26663437485694885, "step": 2909 }, { @@ -84376,27 +84376,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.029296875, + "completions/clipped_ratio": 0.08203125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 795.3046875, - "completions/mean_terminated_length": 757.4969482421875, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1144.462890625, + "completions/mean_terminated_length": 1063.72119140625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, "epoch": 0.9934283519672271, - "grad_norm": 2.0867769718170166, - "kl": 7.4140625, - "learning_rate": 1.001277325106131e-07, - "loss": 0.4757, - "num_tokens": 1539599325.0, - "reward": 1.89453125, - "reward_std": 0.5042934417724609, - "rewards/accuracy_reward/mean": 0.064453125, - "rewards/accuracy_reward/std": 0.24579854309558868, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.1719430834054947, + "grad_norm": 1.711689829826355, + "kl": 4.6953125, + "learning_rate": 1.0012782943829913e-07, + "loss": 0.2221, + "num_tokens": 1684225410.0, + "reward": 1.49365234375, + "reward_std": 0.7310701608657837, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.537109375, + "rewards/format_reward/std": 0.4991086423397064, + "rewards/tag_count_reward/mean": 0.85888671875, + "rewards/tag_count_reward/std": 0.23561164736747742, "step": 2910 }, { @@ -84405,27 +84405,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1965.0, - "completions/mean_length": 749.224609375, - "completions/mean_terminated_length": 723.3526000976562, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1073.982421875, + "completions/mean_terminated_length": 1030.2509765625, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, "epoch": 0.9937697362806179, - "grad_norm": 1.966011881828308, - "kl": 7.46875, - "learning_rate": 1.0011527912258924e-07, - "loss": 0.4478, - "num_tokens": 1540065920.0, - "reward": 1.88134765625, - "reward_std": 0.5172063708305359, - "rewards/accuracy_reward/mean": 0.08870967477560043, - "rewards/accuracy_reward/std": 0.2846112847328186, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.93603515625, - "rewards/tag_count_reward/std": 0.171859011054039, + "grad_norm": 2.9198687076568604, + "kl": 4.81640625, + "learning_rate": 1.0011536660063326e-07, + "loss": 0.21, + "num_tokens": 1684858281.0, + "reward": 1.4697265625, + "reward_std": 0.7562755346298218, + "rewards/accuracy_reward/mean": 0.11491935700178146, + "rewards/accuracy_reward/std": 0.3192465901374817, + "rewards/format_reward/mean": 0.517578125, + "rewards/format_reward/std": 0.5001795887947083, + "rewards/tag_count_reward/mean": 0.8408203125, + "rewards/tag_count_reward/std": 0.24139538407325745, "step": 2911 }, { @@ -84434,27 +84434,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.01953125, + "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1955.0, - "completions/mean_length": 726.9921875, - "completions/mean_terminated_length": 700.6773071289062, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1074.673828125, + "completions/mean_terminated_length": 1020.4887084960938, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, "epoch": 0.9941111205940087, - "grad_norm": 2.497357130050659, - "kl": 7.30859375, - "learning_rate": 1.0010346426364161e-07, - "loss": 0.4731, - "num_tokens": 1540513356.0, - "reward": 1.927734375, - "reward_std": 0.49232202768325806, - "rewards/accuracy_reward/mean": 0.09765625, - "rewards/accuracy_reward/std": 0.29713961482048035, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.943359375, - "rewards/tag_count_reward/std": 0.1697956770658493, + "grad_norm": 3.068920850753784, + "kl": 5.625, + "learning_rate": 1.0010354277647939e-07, + "loss": 0.2637, + "num_tokens": 1685483730.0, + "reward": 1.48046875, + "reward_std": 0.741915225982666, + "rewards/accuracy_reward/mean": 0.095703125, + "rewards/accuracy_reward/std": 0.2944713830947876, + "rewards/format_reward/mean": 0.533203125, + "rewards/format_reward/std": 0.4993842542171478, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.24014326930046082, "step": 2912 }, { @@ -84463,27 +84463,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 795.17578125, - "completions/mean_terminated_length": 749.5263061523438, - "completions/min_length": 79.0, - "completions/min_terminated_length": 79.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1126.357421875, + "completions/mean_terminated_length": 1066.9583740234375, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, "epoch": 0.9944525049073996, - "grad_norm": 4.3932576179504395, - "kl": 8.859375, - "learning_rate": 1.0009228795053926e-07, - "loss": 0.5182, - "num_tokens": 1540991062.0, - "reward": 1.84814453125, - "reward_std": 0.4987267851829529, - "rewards/accuracy_reward/mean": 0.06854838877916336, - "rewards/accuracy_reward/std": 0.25293970108032227, - "rewards/format_reward/mean": 0.857421875, - "rewards/format_reward/std": 0.3499840497970581, - "rewards/tag_count_reward/mean": 0.92431640625, - "rewards/tag_count_reward/std": 0.1966189742088318, + "grad_norm": 1.6450693607330322, + "kl": 4.46484375, + "learning_rate": 1.0009235798263205e-07, + "loss": 0.1867, + "num_tokens": 1686131001.0, + "reward": 1.423828125, + "reward_std": 0.7515327334403992, + "rewards/accuracy_reward/mean": 0.08467742055654526, + "rewards/accuracy_reward/std": 0.278682142496109, + "rewards/format_reward/mean": 0.505859375, + "rewards/format_reward/std": 0.5004546642303467, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.2535221576690674, "step": 2913 }, { @@ -84492,27 +84492,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.064453125, + "completions/clipped_ratio": 0.07421875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2004.0, - "completions/mean_length": 876.916015625, - "completions/mean_terminated_length": 796.2359008789062, - "completions/min_length": 23.0, - "completions/min_terminated_length": 23.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1158.17578125, + "completions/mean_terminated_length": 1086.839599609375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.9947938892207903, - "grad_norm": 5.697507858276367, - "kl": 11.0703125, - "learning_rate": 1.0008175019914494e-07, - "loss": 0.6048, - "num_tokens": 1541517451.0, - "reward": 1.7763671875, - "reward_std": 0.6395107507705688, - "rewards/accuracy_reward/mean": 0.08203125, - "rewards/accuracy_reward/std": 0.2746807038784027, - "rewards/format_reward/mean": 0.802734375, - "rewards/format_reward/std": 0.3983237147331238, - "rewards/tag_count_reward/mean": 0.8916015625, - "rewards/tag_count_reward/std": 0.2345370054244995, + "grad_norm": 3.201535940170288, + "kl": 5.1484375, + "learning_rate": 1.00081812234978e-07, + "loss": 0.2303, + "num_tokens": 1686801395.0, + "reward": 1.43017578125, + "reward_std": 0.7791985273361206, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.498046875, + "rewards/format_reward/std": 0.5004851818084717, + "rewards/tag_count_reward/mean": 0.81494140625, + "rewards/tag_count_reward/std": 0.2636793553829193, "step": 2914 }, { @@ -84521,27 +84521,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.02734375, + "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 797.865234375, - "completions/mean_terminated_length": 762.7208251953125, - "completions/min_length": 83.0, - "completions/min_terminated_length": 83.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1126.201171875, + "completions/mean_terminated_length": 1060.6339111328125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, "epoch": 0.9951352735341811, - "grad_norm": 1.3660556077957153, - "kl": 7.203125, - "learning_rate": 1.0007185102441505e-07, - "loss": 0.4109, - "num_tokens": 1541997958.0, - "reward": 1.8818359375, - "reward_std": 0.5130884647369385, - "rewards/accuracy_reward/mean": 0.068359375, - "rewards/accuracy_reward/std": 0.25260838866233826, - "rewards/format_reward/mean": 0.875, - "rewards/format_reward/std": 0.3310423493385315, - "rewards/tag_count_reward/mean": 0.9384765625, - "rewards/tag_count_reward/std": 0.1779806911945343, + "grad_norm": 3.7891156673431396, + "kl": 4.2421875, + "learning_rate": 1.0007190554849646e-07, + "loss": 0.1891, + "num_tokens": 1687450010.0, + "reward": 1.51318359375, + "reward_std": 0.7274699211120605, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.568359375, + "rewards/format_reward/std": 0.4957893490791321, + "rewards/tag_count_reward/mean": 0.86083984375, + "rewards/tag_count_reward/std": 0.23625560104846954, "step": 2915 }, { @@ -84550,27 +84550,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, + "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 896.453125, - "completions/mean_terminated_length": 819.683349609375, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1238.1640625, + "completions/mean_terminated_length": 1162.0257568359375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.9954766578475719, - "grad_norm": 1.6234740018844604, - "kl": 7.8984375, - "learning_rate": 1.0006259044039964e-07, - "loss": 0.5138, - "num_tokens": 1542542478.0, - "reward": 1.88037109375, - "reward_std": 0.5088456869125366, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.87890625, - "rewards/format_reward/std": 0.3265552520751953, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.20027364790439606, + "grad_norm": 3.015854835510254, + "kl": 4.83984375, + "learning_rate": 1.0006263793725872e-07, + "loss": 0.2123, + "num_tokens": 1688169486.0, + "reward": 1.4375, + "reward_std": 0.7634412050247192, + "rewards/accuracy_reward/mean": 0.083984375, + "rewards/accuracy_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.541015625, + "rewards/format_reward/std": 0.49880221486091614, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.26726123690605164, "step": 2916 }, { @@ -84579,27 +84579,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.044921875, + "completions/clipped_ratio": 0.060546875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1997.0, - "completions/mean_length": 780.3984375, - "completions/mean_terminated_length": 720.777099609375, - "completions/min_length": 96.0, - "completions/min_terminated_length": 96.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1082.146484375, + "completions/mean_terminated_length": 1019.8981323242188, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, "epoch": 0.9958180421609627, - "grad_norm": 1.4404579401016235, - "kl": 8.1015625, - "learning_rate": 1.000539684602424e-07, - "loss": 0.5381, - "num_tokens": 1543018458.0, - "reward": 1.8310546875, - "reward_std": 0.5409401655197144, - "rewards/accuracy_reward/mean": 0.052734375, - "rewards/accuracy_reward/std": 0.22372129559516907, - "rewards/format_reward/mean": 0.86328125, - "rewards/format_reward/std": 0.3438861668109894, - "rewards/tag_count_reward/mean": 0.9150390625, - "rewards/tag_count_reward/std": 0.2163981944322586, + "grad_norm": 4.808913230895996, + "kl": 4.3046875, + "learning_rate": 1.0005400941442848e-07, + "loss": 0.1963, + "num_tokens": 1688799961.0, + "reward": 1.5009765625, + "reward_std": 0.7337645292282104, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.58984375, + "rewards/format_reward/std": 0.49234291911125183, + "rewards/tag_count_reward/mean": 0.8369140625, + "rewards/tag_count_reward/std": 0.2640654742717743, "step": 2917 }, { @@ -84608,27 +84608,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 798.650390625, - "completions/mean_terminated_length": 747.86376953125, - "completions/min_length": 81.0, - "completions/min_terminated_length": 81.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1098.517578125, + "completions/mean_terminated_length": 1035.21875, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, "epoch": 0.9961594264743535, - "grad_norm": 1.7061262130737305, - "kl": 6.640625, - "learning_rate": 1.0004598509618068e-07, - "loss": 0.4761, - "num_tokens": 1543498935.0, - "reward": 1.8583984375, - "reward_std": 0.47106894850730896, - "rewards/accuracy_reward/mean": 0.033203125, - "rewards/accuracy_reward/std": 0.17934183776378632, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.9404296875, - "rewards/tag_count_reward/std": 0.18337510526180267, + "grad_norm": 5.332459926605225, + "kl": 3.953125, + "learning_rate": 1.0004601999226164e-07, + "loss": 0.1689, + "num_tokens": 1689433970.0, + "reward": 1.45751953125, + "reward_std": 0.72110515832901, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.548828125, + "rewards/format_reward/std": 0.498096764087677, + "rewards/tag_count_reward/mean": 0.84228515625, + "rewards/tag_count_reward/std": 0.24956724047660828, "step": 2918 }, { @@ -84637,27 +84637,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.046875, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 861.73046875, - "completions/mean_terminated_length": 803.3892822265625, - "completions/min_length": 190.0, - "completions/min_terminated_length": 190.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1160.478515625, + "completions/mean_terminated_length": 1085.264892578125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.9965008107877443, - "grad_norm": 2.162806510925293, - "kl": 7.6875, - "learning_rate": 1.0003864035954539e-07, - "loss": 0.455, - "num_tokens": 1544016797.0, - "reward": 1.91650390625, - "reward_std": 0.570501446723938, - "rewards/accuracy_reward/mean": 0.13671875, - "rewards/accuracy_reward/std": 0.3438861668109894, - "rewards/format_reward/mean": 0.859375, - "rewards/format_reward/std": 0.3479743003845215, - "rewards/tag_count_reward/mean": 0.92041015625, - "rewards/tag_count_reward/std": 0.2078179121017456, + "grad_norm": 3.012641429901123, + "kl": 4.609375, + "learning_rate": 1.0003866968210636e-07, + "loss": 0.2057, + "num_tokens": 1690104791.0, + "reward": 1.53515625, + "reward_std": 0.7460612058639526, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.56640625, + "rewards/format_reward/std": 0.4960552453994751, + "rewards/tag_count_reward/mean": 0.853515625, + "rewards/tag_count_reward/std": 0.23673690855503082, "step": 2919 }, { @@ -84666,27 +84666,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.037109375, + "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 819.791015625, - "completions/mean_terminated_length": 772.4563598632812, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1164.921875, + "completions/mean_terminated_length": 1090.084716796875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, "epoch": 0.9968421951011351, - "grad_norm": 1.647328495979309, - "kl": 5.90625, - "learning_rate": 1.0003193426076107e-07, - "loss": 0.3672, - "num_tokens": 1544526674.0, - "reward": 1.89013671875, - "reward_std": 0.524093747138977, - "rewards/accuracy_reward/mean": 0.095703125, - "rewards/accuracy_reward/std": 0.2944713830947876, - "rewards/format_reward/mean": 0.865234375, - "rewards/format_reward/std": 0.3418070077896118, - "rewards/tag_count_reward/mean": 0.92919921875, - "rewards/tag_count_reward/std": 0.19280590116977692, + "grad_norm": 2.2774465084075928, + "kl": 4.9296875, + "learning_rate": 1.0003195849440295e-07, + "loss": 0.2602, + "num_tokens": 1690791375.0, + "reward": 1.416015625, + "reward_std": 0.7490825653076172, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.490234375, + "rewards/format_reward/std": 0.5003935098648071, + "rewards/tag_count_reward/mean": 0.826171875, + "rewards/tag_count_reward/std": 0.26041626930236816, "step": 2920 }, { @@ -84695,27 +84695,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0625, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1975.0, - "completions/mean_length": 832.6796875, - "completions/mean_terminated_length": 751.6583862304688, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1138.990234375, + "completions/mean_terminated_length": 1076.3653564453125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, "epoch": 0.997183579414526, - "grad_norm": 1.7485300302505493, - "kl": 8.65625, - "learning_rate": 1.0002586680934577e-07, - "loss": 0.5546, - "num_tokens": 1545028510.0, - "reward": 1.80126953125, - "reward_std": 0.6211615800857544, - "rewards/accuracy_reward/mean": 0.09375, - "rewards/accuracy_reward/std": 0.29176566004753113, - "rewards/format_reward/mean": 0.8046875, - "rewards/format_reward/std": 0.3968288004398346, - "rewards/tag_count_reward/mean": 0.90283203125, - "rewards/tag_count_reward/std": 0.22384031116962433, + "grad_norm": 1.4758949279785156, + "kl": 4.5078125, + "learning_rate": 1.0002588643868397e-07, + "loss": 0.2073, + "num_tokens": 1691450042.0, + "reward": 1.4814453125, + "reward_std": 0.8069281578063965, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.541015625, + "rewards/format_reward/std": 0.49880221486091614, + "rewards/tag_count_reward/mean": 0.8193359375, + "rewards/tag_count_reward/std": 0.2664860486984253, "step": 2921 }, { @@ -84724,27 +84724,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.033203125, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 821.447265625, - "completions/mean_terminated_length": 779.3232421875, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1090.2578125, + "completions/mean_terminated_length": 1039.0205078125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, "epoch": 0.9975249637279167, - "grad_norm": 1.8131388425827026, - "kl": 4.9453125, - "learning_rate": 1.0002043801391112e-07, - "loss": 0.3404, - "num_tokens": 1545526915.0, - "reward": 1.8935546875, - "reward_std": 0.46334606409072876, - "rewards/accuracy_reward/mean": 0.072265625, - "rewards/accuracy_reward/std": 0.2591804563999176, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, - "rewards/tag_count_reward/mean": 0.9365234375, - "rewards/tag_count_reward/std": 0.18800638616085052, + "grad_norm": 3.482325553894043, + "kl": 4.80078125, + "learning_rate": 1.0002045352357407e-07, + "loss": 0.1939, + "num_tokens": 1692086078.0, + "reward": 1.43798828125, + "reward_std": 0.7056668996810913, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.521484375, + "rewards/format_reward/std": 0.5000267624855042, + "rewards/tag_count_reward/mean": 0.83837890625, + "rewards/tag_count_reward/std": 0.25388702750205994, "step": 2922 }, { @@ -84753,27 +84753,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.025390625, + "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1934.0, - "completions/mean_length": 727.15234375, - "completions/mean_terminated_length": 692.7415161132812, - "completions/min_length": 71.0, - "completions/min_terminated_length": 71.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1032.2109375, + "completions/mean_terminated_length": 990.9186401367188, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.9978663480413075, - "grad_norm": 1.924739956855774, - "kl": 4.8359375, - "learning_rate": 1.0001564788216237e-07, - "loss": 0.3474, - "num_tokens": 1545969441.0, - "reward": 1.9033203125, - "reward_std": 0.42516008019447327, - "rewards/accuracy_reward/mean": 0.060546875, - "rewards/accuracy_reward/std": 0.2387305200099945, - "rewards/format_reward/mean": 0.89453125, - "rewards/format_reward/std": 0.3074568510055542, - "rewards/tag_count_reward/mean": 0.9482421875, - "rewards/tag_count_reward/std": 0.1670125275850296, + "grad_norm": 3.0966458320617676, + "kl": 4.8515625, + "learning_rate": 1.000156597567902e-07, + "loss": 0.1938, + "num_tokens": 1692684794.0, + "reward": 1.4345703125, + "reward_std": 0.6710656881332397, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.525390625, + "rewards/format_reward/std": 0.4998432695865631, + "rewards/tag_count_reward/mean": 0.8525390625, + "rewards/tag_count_reward/std": 0.2427690029144287, "step": 2923 }, { @@ -84782,27 +84782,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.03515625, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1925.0, - "completions/mean_length": 788.767578125, - "completions/mean_terminated_length": 742.8846435546875, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1119.09765625, + "completions/mean_terminated_length": 1069.4033203125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.9982077323546983, - "grad_norm": 1.661131501197815, - "kl": 4.83203125, - "learning_rate": 1.0001149642089817e-07, - "loss": 0.3272, - "num_tokens": 1546444058.0, - "reward": 1.916015625, - "reward_std": 0.48726630210876465, - "rewards/accuracy_reward/mean": 0.0859375, - "rewards/accuracy_reward/std": 0.28054583072662354, - "rewards/format_reward/mean": 0.888671875, - "rewards/format_reward/std": 0.31484565138816833, - "rewards/tag_count_reward/mean": 0.94140625, - "rewards/tag_count_reward/std": 0.17270830273628235, + "grad_norm": 3.7675349712371826, + "kl": 4.26171875, + "learning_rate": 1.000115051451414e-07, + "loss": 0.1967, + "num_tokens": 1693328540.0, + "reward": 1.52392578125, + "reward_std": 0.7720863223075867, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.55078125, + "rewards/format_reward/std": 0.497901052236557, + "rewards/tag_count_reward/mean": 0.84619140625, + "rewards/tag_count_reward/std": 0.24510899186134338, "step": 2924 }, { @@ -84811,27 +84811,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.056640625, + "completions/clipped_ratio": 0.068359375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1980.0, - "completions/mean_length": 869.40234375, - "completions/mean_terminated_length": 798.6376953125, - "completions/min_length": 76.0, - "completions/min_terminated_length": 76.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1150.630859375, + "completions/mean_terminated_length": 1084.7861328125, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, "epoch": 0.9985491166680891, - "grad_norm": 2.2363667488098145, - "kl": 5.9765625, - "learning_rate": 1.0000798363601074e-07, - "loss": 0.431, - "num_tokens": 1546969224.0, - "reward": 1.90380859375, - "reward_std": 0.5548034310340881, - "rewards/accuracy_reward/mean": 0.103515625, - "rewards/accuracy_reward/std": 0.30492907762527466, - "rewards/format_reward/mean": 0.869140625, - "rewards/format_reward/std": 0.33757632970809937, - "rewards/tag_count_reward/mean": 0.93115234375, - "rewards/tag_count_reward/std": 0.19351330399513245, + "grad_norm": 2.030956506729126, + "kl": 4.17578125, + "learning_rate": 1.0000798969452879e-07, + "loss": 0.166, + "num_tokens": 1693997695.0, + "reward": 1.52294921875, + "reward_std": 0.7646929025650024, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.572265625, + "rewards/format_reward/std": 0.4952339828014374, + "rewards/tag_count_reward/mean": 0.84326171875, + "rewards/tag_count_reward/std": 0.25261548161506653, "step": 2925 }, { @@ -84840,27 +84840,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2015.0, - "completions/mean_length": 755.484375, - "completions/mean_terminated_length": 702.9430541992188, - "completions/min_length": 75.0, - "completions/min_terminated_length": 75.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1045.6171875, + "completions/mean_terminated_length": 991.9917602539062, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.9988905009814799, - "grad_norm": 3.301025867462158, - "kl": 5.73828125, - "learning_rate": 1.0000510953248593e-07, - "loss": 0.4159, - "num_tokens": 1547430640.0, - "reward": 1.8583984375, - "reward_std": 0.5075932741165161, - "rewards/accuracy_reward/mean": 0.05859375, - "rewards/accuracy_reward/std": 0.23509246110916138, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.9326171875, - "rewards/tag_count_reward/std": 0.1879453957080841, + "grad_norm": 2.6360840797424316, + "kl": 5.2109375, + "learning_rate": 1.0000511340994572e-07, + "loss": 0.2644, + "num_tokens": 1694607659.0, + "reward": 1.3779296875, + "reward_std": 0.7198290824890137, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5004889965057373, + "rewards/tag_count_reward/mean": 0.8310546875, + "rewards/tag_count_reward/std": 0.2608177363872528, "step": 2926 }, { @@ -84869,27 +84869,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.0390625, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1990.0, - "completions/mean_length": 871.13671875, - "completions/mean_terminated_length": 823.2966918945312, - "completions/min_length": 80.0, - "completions/min_terminated_length": 80.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1169.10546875, + "completions/mean_terminated_length": 1108.5552978515625, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, "epoch": 0.9992318852948707, - "grad_norm": 2.2923011779785156, - "kl": 6.11328125, - "learning_rate": 1.0000287411440292e-07, - "loss": 0.4166, - "num_tokens": 1547950582.0, - "reward": 1.826171875, - "reward_std": 0.47404032945632935, - "rewards/accuracy_reward/mean": 0.0234375, - "rewards/accuracy_reward/std": 0.15143637359142303, - "rewards/format_reward/mean": 0.8671875, - "rewards/format_reward/std": 0.33970388770103455, - "rewards/tag_count_reward/mean": 0.935546875, - "rewards/tag_count_reward/std": 0.18238498270511627, + "grad_norm": 3.7356173992156982, + "kl": 3.8125, + "learning_rate": 1.0000287629547764e-07, + "loss": 0.181, + "num_tokens": 1695280161.0, + "reward": 1.4658203125, + "reward_std": 0.7172948122024536, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23509246110916138, + "rewards/format_reward/mean": 0.560546875, + "rewards/format_reward/std": 0.49680593609809875, + "rewards/tag_count_reward/mean": 0.8466796875, + "rewards/tag_count_reward/std": 0.25253841280937195, "step": 2927 }, { @@ -84898,27 +84898,27 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.05078125, + "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 837.9765625, - "completions/mean_terminated_length": 773.2427978515625, - "completions/min_length": 93.0, - "completions/min_terminated_length": 93.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1133.03515625, + "completions/mean_terminated_length": 1070.0, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, "epoch": 0.9995732696082615, - "grad_norm": 0.9258942604064941, - "kl": 8.15625, - "learning_rate": 1.0000127738493448e-07, - "loss": 0.5488, - "num_tokens": 1548449850.0, - "reward": 1.81396484375, - "reward_std": 0.5616399049758911, - "rewards/accuracy_reward/mean": 0.07056451588869095, - "rewards/accuracy_reward/std": 0.25635457038879395, - "rewards/format_reward/mean": 0.837890625, - "rewards/format_reward/std": 0.3689115643501282, - "rewards/tag_count_reward/mean": 0.90771484375, - "rewards/tag_count_reward/std": 0.22427105903625488, + "grad_norm": 2.1725656986236572, + "kl": 4.5546875, + "learning_rate": 1.0000127835430222e-07, + "loss": 0.2092, + "num_tokens": 1695930499.0, + "reward": 1.4501953125, + "reward_std": 0.7432597875595093, + "rewards/accuracy_reward/mean": 0.07258064299821854, + "rewards/accuracy_reward/std": 0.25970885157585144, + "rewards/format_reward/mean": 0.552734375, + "rewards/format_reward/std": 0.4976975917816162, + "rewards/tag_count_reward/mean": 0.8271484375, + "rewards/tag_count_reward/std": 0.260597825050354, "step": 2928 }, { @@ -84927,42 +84927,42 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.027027027027026973, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1615.0, - "completions/mean_length": 800.9459838867188, - "completions/mean_terminated_length": 766.3055419921875, - "completions/min_length": 272.0, - "completions/min_terminated_length": 272.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1973.0, + "completions/max_terminated_length": 1973.0, + "completions/mean_length": 883.189208984375, + "completions/mean_terminated_length": 883.189208984375, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, "epoch": 0.9999146539216524, - "grad_norm": 1.447707176208496, - "kl": 5.875, - "learning_rate": 1.0000031934634693e-07, - "loss": 0.3851, - "num_tokens": 1548957970.0, - "reward": 1.91943359375, - "reward_std": 0.47769179940223694, - "rewards/accuracy_reward/mean": 0.091796875, - "rewards/accuracy_reward/std": 0.289021372795105, - "rewards/format_reward/mean": 0.88671875, - "rewards/format_reward/std": 0.3172462284564972, - "rewards/tag_count_reward/mean": 0.94091796875, - "rewards/tag_count_reward/std": 0.17639723420143127, + "grad_norm": 3.4660446643829346, + "kl": 4.16796875, + "learning_rate": 1.0000031958868901e-07, + "loss": 0.1983, + "num_tokens": 1696584447.0, + "reward": 1.42431640625, + "reward_std": 0.7108050584793091, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.525390625, + "rewards/format_reward/std": 0.4998432695865631, + "rewards/tag_count_reward/mean": 0.83251953125, + "rewards/tag_count_reward/std": 0.2587076425552368, "step": 2929 }, { "epoch": 0.9999146539216524, "step": 2929, "total_flos": 0.0, - "train_loss": 0.0, - "train_runtime": 1.0123, - "train_samples_per_second": 92590.82, - "train_steps_per_second": 2893.309 + "train_loss": 0.3349957286366559, + "train_runtime": 80571.461, + "train_samples_per_second": 1.163, + "train_steps_per_second": 0.036 } ], "logging_steps": 1, "max_steps": 2929, - "num_input_tokens_seen": 1548957970, + "num_input_tokens_seen": 1696584447, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": {